Merge tag 'kbuild-fixes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 19 Nov 2023 21:54:28 +0000 (13:54 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 19 Nov 2023 21:54:28 +0000 (13:54 -0800)
Pull Kbuild fixes from Masahiro Yamada:

 - Fix section mismatch warning messages for riscv and loongarch

 - Remove CONFIG_IA64 left-over from linux/export-internal.h

 - Fix the location of the quotes for UIMAGE_NAME

 - Fix a memory leak bug in Kconfig

* tag 'kbuild-fixes-v6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild:
  kconfig: fix memory leak from range properties
  kbuild: Move the single quotes for image name
  linux/export: clean up the IA-64 KSYM_FUNC macro
  modpost: fix section mismatch message for RELA
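
The Kconfig leak is the classic pattern of overwriting a heap-allocated string without freeing its predecessor, so repeated evaluation of a range property accumulates allocations. A minimal sketch of the pattern and its fix, with hypothetical names rather than the actual kconfig code:

    #include <stdlib.h>
    #include <string.h>

    static char *range_val;                 /* owns a heap-allocated string */

    static void set_range_val(const char *s)
    {
            free(range_val);                /* without this, every call leaks the old value */
            range_val = strdup(s);
    }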

245 files changed:
Documentation/arch/x86/boot.rst
Documentation/devicetree/bindings/net/ethernet-controller.yaml
Documentation/devicetree/bindings/ufs/qcom,ufs.yaml
MAINTAINERS
arch/parisc/Kconfig
arch/parisc/include/asm/elf.h
arch/parisc/include/asm/processor.h
arch/parisc/kernel/processor.c
arch/parisc/kernel/sys_parisc.c
arch/x86/include/asm/acpi.h
arch/x86/include/asm/xen/hypervisor.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/signal_64.c
block/blk-mq.c
drivers/accel/ivpu/ivpu_pm.c
drivers/firmware/Kconfig
drivers/firmware/qemu_fw_cfg.c
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
drivers/gpu/drm/amd/display/dc/core/dc.c
drivers/gpu/drm/amd/display/dc/core/dc_resource.c
drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
drivers/gpu/drm/amd/display/dc/dc_types.h
drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c
drivers/gpu/drm/amd/display/dc/link/link_detection.c
drivers/gpu/drm/amd/display/dmub/dmub_srv.h
drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
drivers/gpu/drm/ci/xfails/requirements.txt
drivers/gpu/drm/drm_panel_orientation_quirks.c
drivers/gpu/drm/nouveau/include/nvkm/core/event.h
drivers/gpu/drm/nouveau/nouveau_display.c
drivers/gpu/drm/nouveau/nvkm/core/event.c
drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
drivers/i2c/busses/i2c-designware-common.c
drivers/i2c/busses/i2c-ocores.c
drivers/i2c/busses/i2c-pxa.c
drivers/irqchip/irq-gic-v3-its.c
drivers/md/dm-bufio.c
drivers/md/dm-crypt.c
drivers/md/dm-delay.c
drivers/md/dm-verity-fec.c
drivers/md/dm-verity-target.c
drivers/md/dm-verity.h
drivers/net/bonding/bond_main.c
drivers/net/ethernet/amd/pds_core/adminq.c
drivers/net/ethernet/amd/pds_core/core.h
drivers/net/ethernet/amd/pds_core/dev.c
drivers/net/ethernet/amd/pds_core/devlink.c
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/broadcom/tg3.h
drivers/net/ethernet/cortina/gemini.c
drivers/net/ethernet/cortina/gemini.h
drivers/net/ethernet/google/gve/gve_main.c
drivers/net/ethernet/google/gve/gve_rx.c
drivers/net/ethernet/google/gve/gve_tx.c
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
drivers/net/ethernet/intel/ice/ice_ddp.c
drivers/net/ethernet/intel/ice/ice_dpll.c
drivers/net/ethernet/intel/ice/ice_dpll.h
drivers/net/ethernet/intel/ice/ice_ptp_hw.c
drivers/net/ethernet/intel/ice/ice_ptp_hw.h
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c
drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/ti/icssg/icssg_prueth.c
drivers/net/ipvlan/ipvlan_core.c
drivers/net/macvlan.c
drivers/net/ppp/ppp_synctty.c
drivers/parisc/power.c
drivers/ptp/ptp_chardev.c
drivers/ptp/ptp_clock.c
drivers/ptp/ptp_private.h
drivers/ptp/ptp_sysfs.c
drivers/scsi/qla2xxx/qla_os.c
drivers/scsi/scsi_debug.c
drivers/scsi/sd.c
drivers/ufs/core/ufs-mcq.c
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
drivers/vhost/vdpa.c
drivers/virtio/virtio_pci_common.c
drivers/virtio/virtio_pci_modern_dev.c
drivers/xen/events/events_2l.c
drivers/xen/events/events_base.c
drivers/xen/events/events_internal.h
drivers/xen/pcpu.c
drivers/xen/xen-front-pgdir-shbuf.c
fs/bcachefs/backpointers.c
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_key_cache_types.h [new file with mode: 0644]
fs/bcachefs/btree_trans_commit.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/data_update.c
fs/bcachefs/disk_groups.c
fs/bcachefs/ec.c
fs/bcachefs/fs-io-pagecache.c
fs/bcachefs/fs-io-pagecache.h
fs/bcachefs/fs.c
fs/bcachefs/fsck.c
fs/bcachefs/inode.c
fs/bcachefs/io_write.c
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_io.c
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_types.h
fs/bcachefs/six.c
fs/bcachefs/subvolume_types.h
fs/bcachefs/trace.h
fs/bcachefs/xattr.c
fs/btrfs/ctree.c
fs/btrfs/delayed-ref.c
fs/btrfs/extent-tree.c
fs/btrfs/extent-tree.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/qgroup.c
fs/btrfs/raid-stripe-tree.c
fs/btrfs/scrub.c
fs/btrfs/volumes.c
fs/btrfs/zoned.c
fs/nfsd/cache.h
fs/nfsd/nfs4state.c
fs/nfsd/nfscache.c
fs/nfsd/nfssvc.c
fs/overlayfs/params.c
fs/overlayfs/util.c
fs/smb/client/cifs_spnego.c
fs/smb/client/connect.c
fs/smb/client/sess.c
fs/smb/client/smb2transport.c
fs/xfs/Kconfig
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_defer.c
fs/xfs/libxfs/xfs_defer.h
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/xfs_inode_item_recover.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_reflink.c
include/linux/bpf.h
include/linux/cpuhotplug.h
include/linux/hrtimer.h
include/linux/mdio.h
include/linux/perf_event.h
include/linux/stackleak.h
include/linux/virtio_pci_modern.h
include/net/netfilter/nf_tables.h
include/net/tc_act/tc_ct.h
include/uapi/linux/btrfs_tree.h
include/uapi/linux/virtio_pci.h
include/xen/events.h
io_uring/fdinfo.c
io_uring/sqpoll.c
kernel/audit_watch.c
kernel/bpf/core.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup.c
kernel/cpu.c
kernel/events/core.c
kernel/futex/core.c
kernel/sched/fair.c
kernel/sys.c
kernel/time/hrtimer.c
lib/zstd/common/fse_decompress.c
mm/damon/core.c
mm/damon/sysfs-schemes.c
mm/damon/sysfs.c
mm/filemap.c
mm/huge_memory.c
mm/ksm.c
mm/memcontrol.c
mm/userfaultfd.c
mm/util.c
net/bridge/netfilter/nf_conntrack_bridge.c
net/core/dev.c
net/core/gso_test.c
net/ipv4/inet_hashtables.c
net/mptcp/pm_netlink.c
net/mptcp/protocol.c
net/mptcp/sockopt.c
net/ncsi/ncsi-aen.c
net/netfilter/ipset/ip_set_core.c
net/netfilter/nf_tables_api.c
net/netfilter/nft_byteorder.c
net/netfilter/nft_meta.c
net/netfilter/nft_set_rbtree.c
net/sched/act_ct.c
net/tipc/netlink_compat.c
net/unix/af_unix.c
scripts/gcc-plugins/latent_entropy_plugin.c
scripts/gcc-plugins/randomize_layout_plugin.c
sound/pci/hda/cs35l56_hda_i2c.c
sound/pci/hda/hda_intel.c
sound/pci/hda/patch_realtek.c
tools/power/x86/turbostat/turbostat.c
tools/testing/selftests/bpf/progs/verifier_cfg.c
tools/testing/selftests/bpf/progs/verifier_loops1.c
tools/testing/selftests/bpf/progs/verifier_precision.c
tools/testing/selftests/bpf/verifier/calls.c
tools/testing/selftests/bpf/verifier/ld_imm64.c
tools/testing/selftests/bpf/xskxceiver.c
tools/testing/selftests/mm/.gitignore
tools/testing/selftests/mm/pagemap_ioctl.c
tools/testing/selftests/mm/run_vmtests.sh
tools/testing/selftests/net/mptcp/mptcp_join.sh

diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index f5d2f24..22cc7a0 100644
@@ -77,7 +77,7 @@ Protocol 2.14 BURNT BY INCORRECT COMMIT
 Protocol 2.15  (Kernel 5.5) Added the kernel_info and kernel_info.setup_type_max.
 =============  ============================================================
 
-.. note::
+  .. note::
      The protocol version number should be changed only if the setup header
      is changed. There is no need to update the version number if boot_params
      or kernel_info are changed. Additionally, it is recommended to use
diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
index 9f6a5cc..d14d123 100644
@@ -275,12 +275,12 @@ allOf:
       properties:
         rx-internal-delay-ps:
           description:
-            RGMII Receive Clock Delay defined in pico seconds.This is used for
+            RGMII Receive Clock Delay defined in pico seconds. This is used for
             controllers that have configurable RX internal delays. If this
             property is present then the MAC applies the RX delay.
         tx-internal-delay-ps:
           description:
-            RGMII Transmit Clock Delay defined in pico seconds.This is used for
+            RGMII Transmit Clock Delay defined in pico seconds. This is used for
             controllers that have configurable TX internal delays. If this
             property is present then the MAC applies the TX delay.
 
diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml
index 462ead5..2cf3d01 100644
@@ -36,6 +36,7 @@ properties:
           - qcom,sm8350-ufshc
           - qcom,sm8450-ufshc
           - qcom,sm8550-ufshc
+          - qcom,sm8650-ufshc
       - const: qcom,ufshc
       - const: jedec,ufs-2.0
 
@@ -122,6 +123,7 @@ allOf:
               - qcom,sm8350-ufshc
               - qcom,sm8450-ufshc
               - qcom,sm8550-ufshc
+              - qcom,sm8650-ufshc
     then:
       properties:
         clocks:
diff --git a/MAINTAINERS b/MAINTAINERS
index 97f51d5..ea79014 100644
@@ -8950,7 +8950,6 @@ S:        Maintained
 F:     scripts/get_maintainer.pl
 
 GFS2 FILE SYSTEM
-M:     Bob Peterson <rpeterso@redhat.com>
 M:     Andreas Gruenbacher <agruenba@redhat.com>
 L:     gfs2@lists.linux.dev
 S:     Supported
@@ -21769,7 +21768,9 @@ F:      Documentation/devicetree/bindings/counter/ti-eqep.yaml
 F:     drivers/counter/ti-eqep.c
 
 TI ETHERNET SWITCH DRIVER (CPSW)
-R:     Grygorii Strashko <grygorii.strashko@ti.com>
+R:     Siddharth Vadapalli <s-vadapalli@ti.com>
+R:     Ravi Gunasekaran <r-gunasekaran@ti.com>
+R:     Roger Quadros <rogerq@kernel.org>
 L:     linux-omap@vger.kernel.org
 L:     netdev@vger.kernel.org
 S:     Maintained
@@ -21793,6 +21794,15 @@ F:     Documentation/devicetree/bindings/media/i2c/ti,ds90*
 F:     drivers/media/i2c/ds90*
 F:     include/media/i2c/ds90*
 
+TI ICSSG ETHERNET DRIVER (ICSSG)
+R:     MD Danish Anwar <danishanwar@ti.com>
+R:     Roger Quadros <rogerq@kernel.org>
+L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/net/ti,icss*.yaml
+F:     drivers/net/ethernet/ti/icssg/*
+
 TI J721E CSI2RX DRIVER
 M:     Jai Luthra <j-luthra@ti.com>
 L:     linux-media@vger.kernel.org
@@ -23692,6 +23702,20 @@ F:     arch/x86/kernel/dumpstack.c
 F:     arch/x86/kernel/stacktrace.c
 F:     arch/x86/kernel/unwind_*.c
 
+X86 TRUST DOMAIN EXTENSIONS (TDX)
+M:     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+R:     Dave Hansen <dave.hansen@linux.intel.com>
+L:     x86@kernel.org
+L:     linux-coco@lists.linux.dev
+S:     Supported
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx
+F:     arch/x86/boot/compressed/tdx*
+F:     arch/x86/coco/tdx/
+F:     arch/x86/include/asm/shared/tdx.h
+F:     arch/x86/include/asm/tdx.h
+F:     arch/x86/virt/vmx/tdx/
+F:     drivers/virt/coco/tdx-guest
+
 X86 VDSO
 M:     Andy Lutomirski <luto@kernel.org>
 L:     linux-kernel@vger.kernel.org
@@ -23872,8 +23896,7 @@ T:      git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git
 P:     Documentation/filesystems/xfs-maintainer-entry-profile.rst
 F:     Documentation/ABI/testing/sysfs-fs-xfs
 F:     Documentation/admin-guide/xfs.rst
-F:     Documentation/filesystems/xfs-delayed-logging-design.rst
-F:     Documentation/filesystems/xfs-self-describing-metadata.rst
+F:     Documentation/filesystems/xfs-*
 F:     fs/xfs/
 F:     include/uapi/linux/dqblk_xfs.h
 F:     include/uapi/linux/fsmap.h
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fd69dfa..a7c9c0e 100644
@@ -140,11 +140,11 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN
        default 8
 
 config ARCH_MMAP_RND_BITS_MAX
-       default 24 if 64BIT
-       default 17
+       default 18 if 64BIT
+       default 13
 
 config ARCH_MMAP_RND_COMPAT_BITS_MAX
-       default 17
+       default 13
 
 # unless you want to implement ACPI on PA-RISC ... ;-)
 config PM
diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h
index 140eaa9..2d73d3c 100644
@@ -349,15 +349,7 @@ struct pt_regs;    /* forward declaration... */
 
 #define ELF_HWCAP      0
 
-/* Masks for stack and mmap randomization */
-#define BRK_RND_MASK   (is_32bit_task() ? 0x07ffUL : 0x3ffffUL)
-#define MMAP_RND_MASK  (is_32bit_task() ? 0x1fffUL : 0x3ffffUL)
-#define STACK_RND_MASK MMAP_RND_MASK
-
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *);
-#define arch_randomize_brk arch_randomize_brk
-
+#define STACK_RND_MASK 0x7ff   /* 8MB of VA */
 
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 struct linux_binprm;
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index c05d121..982aca2 100644
@@ -47,6 +47,8 @@
 
 #ifndef __ASSEMBLY__
 
+struct rlimit;
+unsigned long mmap_upper_limit(struct rlimit *rlim_stack);
 unsigned long calc_max_stack_size(unsigned long stack_max);
 
 /*
diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 29e2750..e95a977 100644
@@ -383,7 +383,7 @@ show_cpuinfo (struct seq_file *m, void *v)
        char cpu_name[60], *p;
 
        /* strip PA path from CPU name to not confuse lscpu */
-       strlcpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name));
+       strscpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name));
        p = strrchr(cpu_name, '[');
        if (p)
                *(--p) = 0;
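
The strlcpy() to strscpy() conversion above follows the tree-wide deprecation of strlcpy(): because strlcpy() returns strlen(src), it must read the entire source string even when the destination is smaller, which can over-read an unterminated buffer. strscpy() bounds the read by the destination size, always NUL-terminates, and returns -E2BIG on truncation. A short usage sketch (hypothetical helper, not kernel code):

    /* sketch: bounded copy with explicit truncation reporting */
    static int copy_cpu_name(char *dst, size_t dst_size, const char *src)
    {
            ssize_t n = strscpy(dst, src, dst_size);

            if (n < 0)
                    return -E2BIG;  /* truncated, but dst is still NUL-terminated */
            return 0;
    }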
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index ab896ef..98af719 100644
@@ -77,7 +77,7 @@ unsigned long calc_max_stack_size(unsigned long stack_max)
  * indicating that "current" should be used instead of a passed-in
  * value from the exec bprm as done with arch_pick_mmap_layout().
  */
-static unsigned long mmap_upper_limit(struct rlimit *rlim_stack)
+unsigned long mmap_upper_limit(struct rlimit *rlim_stack)
 {
        unsigned long stack_base;
 
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index c8a7fc2..f896eed 100644
@@ -16,6 +16,9 @@
 #include <asm/x86_init.h>
 #include <asm/cpufeature.h>
 #include <asm/irq_vectors.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
 
 #ifdef CONFIG_ACPI_APEI
 # include <asm/pgtable_types.h>
@@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
        if (!cpu_has(c, X86_FEATURE_MWAIT) ||
            boot_option_idle_override == IDLE_NOMWAIT)
                *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH);
+
+       if (xen_initial_domain()) {
+               /*
+                * When Linux is running as Xen dom0, the hypervisor is the
+                * entity in charge of the processor power management, and so
+                * Xen needs to check the OS capabilities reported in the
+                * processor capabilities buffer matches what the hypervisor
+                * driver supports.
+                */
+               xen_sanitize_proc_cap_bits(cap);
+       }
 }
 
 static inline bool acpi_has_cpu_in_madt(void)
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 7048dfa..a908825 100644
@@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode)
 
 enum xen_lazy_mode xen_get_lazy_mode(void);
 
+#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI)
+void xen_sanitize_proc_cap_bits(uint32_t *buf);
+#else
+static inline void xen_sanitize_proc_cap_bits(uint32_t *buf)
+{
+       BUG();
+}
+#endif
+
 #endif /* _ASM_X86_XEN_HYPERVISOR_H */
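
The #else branch above is the standard kernel stub pattern: when a feature is configured out, a static inline stand-in lets callers compile without #ifdef clutter. Here the stub is BUG() rather than a no-op because the only caller is guarded by xen_initial_domain(), which cannot be true without Xen dom0 support, so reaching the stub would indicate a mis-guarded caller. The general shape of the pattern, with hypothetical names:

    #ifdef CONFIG_FOO
    void foo_sanitize(uint32_t *buf);               /* real implementation in foo.c */
    #else
    static inline void foo_sanitize(uint32_t *buf)
    {
            BUG();                                  /* unreachable unless callers are mis-guarded */
    }
    #endif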
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index d0918a7..1a0dd80 100644
@@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
 static bool acpi_support_online_capable;
 #endif
 
@@ -232,6 +233,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
        if (!acpi_is_processor_usable(processor->lapic_flags))
                return 0;
 
+       /*
+        * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+        * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+        * in x2APIC must be equal or greater than 0xff.
+        */
+       if (has_lapic_cpus && apic_id < 0xff)
+               return 0;
+
        /*
         * We need to register disabled CPU as well to permit
         * counting disabled CPUs. This allows us to size
@@ -1114,10 +1123,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 
 static int __init acpi_parse_madt_lapic_entries(void)
 {
-       int count;
-       int x2count = 0;
-       int ret;
-       struct acpi_subtable_proc madt_proc[2];
+       int count, x2count = 0;
 
        if (!boot_cpu_has(X86_FEATURE_APIC))
                return -ENODEV;
@@ -1126,21 +1132,11 @@ static int __init acpi_parse_madt_lapic_entries(void)
                                      acpi_parse_sapic, MAX_LOCAL_APIC);
 
        if (!count) {
-               memset(madt_proc, 0, sizeof(madt_proc));
-               madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
-               madt_proc[0].handler = acpi_parse_lapic;
-               madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
-               madt_proc[1].handler = acpi_parse_x2apic;
-               ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
-                               sizeof(struct acpi_table_madt),
-                               madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
-               if (ret < 0) {
-                       pr_err("Error parsing LAPIC/X2APIC entries\n");
-                       return ret;
-               }
-
-               count = madt_proc[0].count;
-               x2count = madt_proc[1].count;
+               count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+                                       acpi_parse_lapic, MAX_LOCAL_APIC);
+               has_lapic_cpus = count > 0;
+               x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+                                       acpi_parse_x2apic, MAX_LOCAL_APIC);
        }
        if (!count && !x2count) {
                pr_err("No LAPIC entries present\n");
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index cacf2ed..23d8aaf 100644
@@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
        frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
        uc_flags = frame_uc_flags(regs);
 
-       if (setup_signal_shadow_stack(ksig))
-               return -EFAULT;
-
        if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;
 
@@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
                        return -EFAULT;
        }
 
+       if (setup_signal_shadow_stack(ksig))
+               return -EFAULT;
+
        /* Set up registers for signal handler */
        regs->di = ksig->sig;
        /* In case the signal handler was declared without prototypes */
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2d1118..900c1be 100644
@@ -2858,11 +2858,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
        };
        struct request *rq;
 
-       if (unlikely(bio_queue_enter(bio)))
-               return NULL;
-
        if (blk_mq_attempt_bio_merge(q, bio, nsegs))
-               goto queue_exit;
+               return NULL;
 
        rq_qos_throttle(q, bio);
 
@@ -2878,35 +2875,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
        rq_qos_cleanup(q, bio);
        if (bio->bi_opf & REQ_NOWAIT)
                bio_wouldblock_error(bio);
-queue_exit:
-       blk_queue_exit(q);
        return NULL;
 }
 
-static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
-               struct blk_plug *plug, struct bio **bio, unsigned int nsegs)
+/* return true if this @rq can be used for @bio */
+static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+               struct bio *bio)
 {
-       struct request *rq;
-       enum hctx_type type, hctx_type;
+       enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
+       enum hctx_type hctx_type = rq->mq_hctx->type;
 
-       if (!plug)
-               return NULL;
-       rq = rq_list_peek(&plug->cached_rq);
-       if (!rq || rq->q != q)
-               return NULL;
+       WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
 
-       if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) {
-               *bio = NULL;
-               return NULL;
-       }
-
-       type = blk_mq_get_hctx_type((*bio)->bi_opf);
-       hctx_type = rq->mq_hctx->type;
        if (type != hctx_type &&
            !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
-               return NULL;
-       if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
-               return NULL;
+               return false;
+       if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
+               return false;
 
        /*
         * If any qos ->throttle() end up blocking, we will have flushed the
@@ -2914,12 +2899,12 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
         * before we throttle.
         */
        plug->cached_rq = rq_list_next(rq);
-       rq_qos_throttle(q, *bio);
+       rq_qos_throttle(rq->q, bio);
 
        blk_mq_rq_time_init(rq, 0);
-       rq->cmd_flags = (*bio)->bi_opf;
+       rq->cmd_flags = bio->bi_opf;
        INIT_LIST_HEAD(&rq->queuelist);
-       return rq;
+       return true;
 }
 
 static void bio_set_ioprio(struct bio *bio)
@@ -2949,7 +2934,7 @@ void blk_mq_submit_bio(struct bio *bio)
        struct blk_plug *plug = blk_mq_plug(bio);
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_mq_hw_ctx *hctx;
-       struct request *rq;
+       struct request *rq = NULL;
        unsigned int nr_segs = 1;
        blk_status_t ret;
 
@@ -2960,20 +2945,36 @@ void blk_mq_submit_bio(struct bio *bio)
                        return;
        }
 
-       if (!bio_integrity_prep(bio))
-               return;
-
        bio_set_ioprio(bio);
 
-       rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
-       if (!rq) {
-               if (!bio)
+       if (plug) {
+               rq = rq_list_peek(&plug->cached_rq);
+               if (rq && rq->q != q)
+                       rq = NULL;
+       }
+       if (rq) {
+               if (!bio_integrity_prep(bio))
                        return;
-               rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
-               if (unlikely(!rq))
+               if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                        return;
+               if (blk_mq_can_use_cached_rq(rq, plug, bio))
+                       goto done;
+               percpu_ref_get(&q->q_usage_counter);
+       } else {
+               if (unlikely(bio_queue_enter(bio)))
+                       return;
+               if (!bio_integrity_prep(bio))
+                       goto fail;
+       }
+
+       rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+       if (unlikely(!rq)) {
+fail:
+               blk_queue_exit(q);
+               return;
        }
 
+done:
        trace_block_getrq(bio);
 
        rq_qos_track(q, rq, bio);
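
The net effect of the blk-mq rework above is that bio_queue_enter() and the cached-request lookup move into blk_mq_submit_bio() itself, so a queue reference is taken exactly once per path and released on the single fail: label. A condensed view of the resulting flow (simplified from the hunks above, not a drop-in replacement):

    /* simplified control flow of blk_mq_submit_bio() after this change */
    rq = plug ? rq_list_peek(&plug->cached_rq) : NULL;
    if (rq && rq->q != q)
            rq = NULL;                      /* cached request belongs to another queue */

    if (rq) {                               /* reuse path: reference held via cached rq */
            if (!bio_integrity_prep(bio))
                    return;
            if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                    return;
            if (blk_mq_can_use_cached_rq(rq, plug, bio))
                    goto done;
            percpu_ref_get(&q->q_usage_counter);    /* cached rq unusable: take a ref */
    } else {                                /* slow path: enter the queue explicitly */
            if (unlikely(bio_queue_enter(bio)))
                    return;
            if (!bio_integrity_prep(bio))
                    goto fail;
    }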
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index 0ace218..e9b16cb 100644
@@ -250,9 +250,6 @@ int ivpu_rpm_get_if_active(struct ivpu_device *vdev)
 {
        int ret;
 
-       ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n",
-                atomic_read(&vdev->drm.dev->power.usage_count));
-
        ret = pm_runtime_get_if_active(vdev->drm.dev, false);
        drm_WARN_ON(&vdev->drm, ret < 0);
 
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 74d00b0..4a98a85 100644
@@ -131,7 +131,7 @@ config RASPBERRYPI_FIRMWARE
 
 config FW_CFG_SYSFS
        tristate "QEMU fw_cfg device support in sysfs"
-       depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || SPARC || X86)
+       depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || RISCV || SPARC || X86)
        depends on HAS_IOPORT_MAP
        default n
        help
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index a69399a..1448f61 100644
@@ -211,7 +211,7 @@ static void fw_cfg_io_cleanup(void)
 
 /* arch-specific ctrl & data register offsets are not available in ACPI, DT */
 #if !(defined(FW_CFG_CTRL_OFF) && defined(FW_CFG_DATA_OFF))
-# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64))
+# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV))
 #  define FW_CFG_CTRL_OFF 0x08
 #  define FW_CFG_DATA_OFF 0x00
 #  define FW_CFG_DMA_OFF 0x10
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index afec099..9d92ca1 100644
@@ -248,6 +248,7 @@ extern int amdgpu_umsch_mm;
 extern int amdgpu_seamless;
 
 extern int amdgpu_user_partt_mode;
+extern int amdgpu_agp;
 
 #define AMDGPU_VM_MAX_NUM_CTX                  4096
 #define AMDGPU_SG_THRESHOLD                    (256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index df3ecfa..e50be65 100644
@@ -207,7 +207,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
        }
 
        for (i = 0; i < p->nchunks; i++) {
-               struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL;
+               struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL;
                struct drm_amdgpu_cs_chunk user_chunk;
                uint32_t __user *cdata;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3095a3a..8f24cab 100644
@@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE;
 int amdgpu_umsch_mm;
 int amdgpu_seamless = -1; /* auto */
 uint amdgpu_debug_mask;
+int amdgpu_agp = -1; /* auto */
 
 static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
 
@@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444);
 MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
 module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
 
+/**
+ * DOC: agp (int)
+ * Enable the AGP aperture.  This provides an aperture in the GPU's internal
+ * address space for direct access to system memory.  Note that these accesses
+ * are non-snooped, so they are only used for access to uncached memory.
+ */
+MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)");
+module_param_named(agp, amdgpu_agp, int, 0444);
+
 /* These devices are not supported by amdgpu.
  * They are supported by the mach64, r128, radeon drivers
  */
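
Since the agp knob added above is registered with module_param_named(agp, ...) and 0444 permissions, it can be set at boot (for example amdgpu.agp=0 on the kernel command line to force the aperture off) and read back from /sys/module/amdgpu/parameters/agp, but not changed at runtime. Note that the gmc_v*_0.c hunks in this merge place the AGP aperture only when agp is explicitly set to 1.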
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 32b701c..a21045d 100644
@@ -1473,6 +1473,11 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
                                topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ?
                                                                topology->nodes[i].num_links : node_num_links;
                        }
+                       /* populate the connected port num info if supported and available */
+                       if (ta_port_num_support && topology->nodes[i].num_links) {
+                               memcpy(topology->nodes[i].port_num, link_extend_info_output->nodes[i].port_num,
+                                      sizeof(struct xgmi_connected_port_num) * TA_XGMI__MAX_PORT_NUM);
+                       }
 
                        /* reflect the topology information for bi-directionality */
                        if (requires_reflection && topology->nodes[i].num_hops)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 5d36ad3..c4d9cbd 100644
@@ -150,6 +150,7 @@ struct psp_xgmi_node_info {
        uint8_t                                 is_sharing_enabled;
        enum ta_xgmi_assigned_sdma_engine       sdma_engine;
        uint8_t                                 num_links;
+       struct xgmi_connected_port_num          port_num[TA_XGMI__MAX_PORT_NUM];
 };
 
 struct psp_xgmi_topology_info {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987..a3dc68e 100644
@@ -1188,7 +1188,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
                        }
 
                        if (block_obj->hw_ops->query_ras_error_count)
-                               block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+                               block_obj->hw_ops->query_ras_error_count(adev, err_data);
 
                        if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
                            (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 65949cc..07d9303 100644
@@ -398,6 +398,7 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
  * amdgpu_uvd_entity_init - init entity
  *
  * @adev: amdgpu_device pointer
+ * @ring: amdgpu_ring pointer to check
  *
  * Initialize the entity used for handle management in the kernel driver.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 0954447..59acf42 100644
@@ -230,6 +230,7 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
  * amdgpu_vce_entity_init - init entity
  *
  * @adev: amdgpu_device pointer
+ * @ring: amdgpu_ring pointer to check
  *
  * Initialize the entity used for handle management in the kernel driver.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 0ec7b06..a5a05c1 100644
@@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev,
        amdgpu_gmc_set_agp_default(adev, mc);
        amdgpu_gmc_vram_location(adev, &adev->gmc, base);
        amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
-       if (!amdgpu_sriov_vf(adev))
+       if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1))
                amdgpu_gmc_agp_location(adev, mc);
 
        /* base offset of vram pages */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 6dce9b2..23d7b54 100644
@@ -640,8 +640,9 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev,
        amdgpu_gmc_set_agp_default(adev, mc);
        amdgpu_gmc_vram_location(adev, &adev->gmc, base);
        amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH);
-       if (!amdgpu_sriov_vf(adev) ||
-           (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)))
+       if (!amdgpu_sriov_vf(adev) &&
+           (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) &&
+           (amdgpu_agp == 1))
                amdgpu_gmc_agp_location(adev, mc);
 
        /* base offset of vram pages */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index bde25eb..2ac5820 100644
@@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
        } else {
                amdgpu_gmc_vram_location(adev, mc, base);
                amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT);
-               if (!amdgpu_sriov_vf(adev))
+               if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1))
                        amdgpu_gmc_agp_location(adev, mc);
        }
        /* base offset of vram pages */
@@ -2170,8 +2170,6 @@ static int gmc_v9_0_sw_fini(void *handle)
 
        if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3))
                amdgpu_gmc_sysfs_fini(adev);
-       adev->gmc.num_mem_partitions = 0;
-       kfree(adev->gmc.mem_partitions);
 
        amdgpu_gmc_ras_fini(adev);
        amdgpu_gem_force_release(adev);
@@ -2185,6 +2183,9 @@ static int gmc_v9_0_sw_fini(void *handle)
        amdgpu_bo_free_kernel(&adev->gmc.pdb0_bo, NULL, &adev->gmc.ptr_pdb0);
        amdgpu_bo_fini(adev);
 
+       adev->gmc.num_mem_partitions = 0;
+       kfree(adev->gmc.mem_partitions);
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index ea14261..9b01467 100644
@@ -130,6 +130,9 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev)
        uint64_t value;
        int i;
 
+       if (amdgpu_sriov_vf(adev))
+               return;
+
        inst_mask = adev->aid_mask;
        for_each_inst(i, inst_mask) {
                /* Program the AGP BAR */
@@ -139,9 +142,6 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev)
                WREG32_SOC15(MMHUB, i, regMC_VM_AGP_TOP,
                             adev->gmc.agp_end >> 24);
 
-               if (amdgpu_sriov_vf(adev))
-                       return;
-
                /* Program the system aperture low logical page number. */
                WREG32_SOC15(MMHUB, i, regMC_VM_SYSTEM_APERTURE_LOW_ADDR,
                        min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 6f99f67..ee97814 100644
@@ -2079,7 +2079,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev)
        struct dmub_srv_create_params create_params;
        struct dmub_srv_region_params region_params;
        struct dmub_srv_region_info region_info;
-       struct dmub_srv_fb_params fb_params;
+       struct dmub_srv_memory_params memory_params;
        struct dmub_srv_fb_info *fb_info;
        struct dmub_srv *dmub_srv;
        const struct dmcub_firmware_header_v1_0 *hdr;
@@ -2182,6 +2182,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev)
                adev->dm.dmub_fw->data +
                le32_to_cpu(hdr->header.ucode_array_offset_bytes) +
                PSP_HEADER_BYTES;
+       region_params.is_mailbox_in_inbox = false;
 
        status = dmub_srv_calc_region_info(dmub_srv, &region_params,
                                           &region_info);
@@ -2205,10 +2206,10 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev)
                return r;
 
        /* Rebase the regions on the framebuffer address. */
-       memset(&fb_params, 0, sizeof(fb_params));
-       fb_params.cpu_addr = adev->dm.dmub_bo_cpu_addr;
-       fb_params.gpu_addr = adev->dm.dmub_bo_gpu_addr;
-       fb_params.region_info = &region_info;
+       memset(&memory_params, 0, sizeof(memory_params));
+       memory_params.cpu_fb_addr = adev->dm.dmub_bo_cpu_addr;
+       memory_params.gpu_fb_addr = adev->dm.dmub_bo_gpu_addr;
+       memory_params.region_info = &region_info;
 
        adev->dm.dmub_fb_info =
                kzalloc(sizeof(*adev->dm.dmub_fb_info), GFP_KERNEL);
@@ -2220,7 +2221,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev)
                return -ENOMEM;
        }
 
-       status = dmub_srv_calc_fb_info(dmub_srv, &fb_params, fb_info);
+       status = dmub_srv_calc_mem_info(dmub_srv, &memory_params, fb_info);
        if (status != DMUB_STATUS_OK) {
                DRM_ERROR("Error calculating DMUB FB info: %d\n", status);
                return -EINVAL;
@@ -7481,6 +7482,9 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap,
        int i;
        int result = -EIO;
 
+       if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported)
+               return result;
+
        cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL);
 
        if (!cmd.payloads)
@@ -9603,14 +9607,14 @@ static bool should_reset_plane(struct drm_atomic_state *state,
        struct drm_plane *other;
        struct drm_plane_state *old_other_state, *new_other_state;
        struct drm_crtc_state *new_crtc_state;
+       struct amdgpu_device *adev = drm_to_adev(plane->dev);
        int i;
 
        /*
-        * TODO: Remove this hack once the checks below are sufficient
-        * enough to determine when we need to reset all the planes on
-        * the stream.
+        * TODO: Remove this hack for all asics once it proves that the
+        * fast updates works fine on DCN3.2+.
         */
-       if (state->allow_modeset)
+       if (adev->ip_versions[DCE_HWIP][0] < IP_VERSION(3, 2, 0) && state->allow_modeset)
                return true;
 
        /* Exit early if we know that we're adding or removing the plane. */
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index ed784cf..c7a29bb 100644
@@ -536,11 +536,8 @@ bool dm_helpers_dp_read_dpcd(
 
        struct amdgpu_dm_connector *aconnector = link->priv;
 
-       if (!aconnector) {
-               drm_dbg_dp(aconnector->base.dev,
-                          "Failed to find connector for link!\n");
+       if (!aconnector)
                return false;
-       }
 
        return drm_dp_dpcd_read(&aconnector->dm_dp_aux.aux, address, data,
                                size) == size;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index d3b13d3..11da0ee 100644
@@ -1604,31 +1604,31 @@ enum dc_status dm_dp_mst_is_port_support_mode(
        unsigned int upper_link_bw_in_kbps = 0, down_link_bw_in_kbps = 0;
        unsigned int max_compressed_bw_in_kbps = 0;
        struct dc_dsc_bw_range bw_range = {0};
-       struct drm_dp_mst_topology_mgr *mst_mgr;
+       uint16_t full_pbn = aconnector->mst_output_port->full_pbn;
 
        /*
-        * check if the mode could be supported if DSC pass-through is supported
-        * AND check if there enough bandwidth available to support the mode
-        * with DSC enabled.
+        * Consider the case with the depth of the mst topology tree is equal or less than 2
+        * A. When dsc bitstream can be transmitted along the entire path
+        *    1. dsc is possible between source and branch/leaf device (common dsc params is possible), AND
+        *    2. dsc passthrough supported at MST branch, or
+        *    3. dsc decoding supported at leaf MST device
+        *    Use maximum dsc compression as bw constraint
+        * B. When dsc bitstream cannot be transmitted along the entire path
+        *    Use native bw as bw constraint
         */
        if (is_dsc_common_config_possible(stream, &bw_range) &&
-           aconnector->mst_output_port->passthrough_aux) {
-               mst_mgr = aconnector->mst_output_port->mgr;
-               mutex_lock(&mst_mgr->lock);
-
+          (aconnector->mst_output_port->passthrough_aux ||
+           aconnector->dsc_aux == &aconnector->mst_output_port->aux)) {
                cur_link_settings = stream->link->verified_link_cap;
 
                upper_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link,
-                                                              &cur_link_settings
-                                                              );
-               down_link_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn);
+                                                              &cur_link_settings);
+               down_link_bw_in_kbps = kbps_from_pbn(full_pbn);
 
                /* pick the bottleneck */
                end_to_end_bw_in_kbps = min(upper_link_bw_in_kbps,
                                            down_link_bw_in_kbps);
 
-               mutex_unlock(&mst_mgr->lock);
-
                /*
                 * use the maximum dsc compression bandwidth as the required
                 * bandwidth for the mode
@@ -1643,8 +1643,7 @@ enum dc_status dm_dp_mst_is_port_support_mode(
                /* check if mode could be supported within full_pbn */
                bpp = convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3;
                pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, bpp, false);
-
-               if (pbn > aconnector->mst_output_port->full_pbn)
+               if (pbn > full_pbn)
                        return DC_FAIL_BANDWIDTH_VALIDATE;
        }
 
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
index 0fa4fcd..507a7cf 100644
@@ -820,22 +820,22 @@ static void dcn35_set_idle_state(struct clk_mgr *clk_mgr_base, bool allow_idle)
 
        if (dc->config.disable_ips == DMUB_IPS_ENABLE ||
                dc->config.disable_ips == DMUB_IPS_DISABLE_DYNAMIC) {
-               val |= DMUB_IPS1_ALLOW_MASK;
-               val |= DMUB_IPS2_ALLOW_MASK;
-       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) {
                val = val & ~DMUB_IPS1_ALLOW_MASK;
                val = val & ~DMUB_IPS2_ALLOW_MASK;
-       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) {
-               val |= DMUB_IPS1_ALLOW_MASK;
-               val = val & ~DMUB_IPS2_ALLOW_MASK;
-       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) {
+       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) {
                val |= DMUB_IPS1_ALLOW_MASK;
                val |= DMUB_IPS2_ALLOW_MASK;
+       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) {
+               val = val & ~DMUB_IPS1_ALLOW_MASK;
+               val |= DMUB_IPS2_ALLOW_MASK;
+       } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) {
+               val = val & ~DMUB_IPS1_ALLOW_MASK;
+               val = val & ~DMUB_IPS2_ALLOW_MASK;
        }
 
        if (!allow_idle) {
-               val = val & ~DMUB_IPS1_ALLOW_MASK;
-               val = val & ~DMUB_IPS2_ALLOW_MASK;
+               val |= DMUB_IPS1_ALLOW_MASK;
+               val |= DMUB_IPS2_ALLOW_MASK;
        }
 
        dcn35_smu_write_ips_scratch(clk_mgr, val);
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 7b9bf5c..76b47f1 100644
@@ -3178,7 +3178,7 @@ static bool update_planes_and_stream_state(struct dc *dc,
                        struct pipe_ctx *otg_master = resource_get_otg_master_for_stream(&context->res_ctx,
                                        context->streams[i]);
 
-                       if (otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE)
+                       if (otg_master && otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE)
                                resource_build_test_pattern_params(&context->res_ctx, otg_master);
                }
        }
@@ -4934,8 +4934,8 @@ bool dc_dmub_is_ips_idle_state(struct dc *dc)
        if (dc->hwss.get_idle_state)
                idle_state = dc->hwss.get_idle_state(dc);
 
-       if ((idle_state & DMUB_IPS1_ALLOW_MASK) ||
-               (idle_state & DMUB_IPS2_ALLOW_MASK))
+       if (!(idle_state & DMUB_IPS1_ALLOW_MASK) ||
+               !(idle_state & DMUB_IPS2_ALLOW_MASK))
                return true;
 
        return false;
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c
index 1d48278..a1f1d10 100644
@@ -5190,6 +5190,9 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm_legacy(
        sec_next = sec_pipe->next_odm_pipe;
        sec_prev = sec_pipe->prev_odm_pipe;
 
+       if (pri_pipe == NULL)
+               return false;
+
        *sec_pipe = *pri_pipe;
 
        sec_pipe->top_pipe = sec_top;
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index e4c0072..0e07699 100644
@@ -1202,11 +1202,11 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc)
                allow_state = dc->hwss.get_idle_state(dc);
                dc->hwss.set_idle_state(dc, false);
 
-               if (allow_state & DMUB_IPS2_ALLOW_MASK) {
+               if (!(allow_state & DMUB_IPS2_ALLOW_MASK)) {
                        // Wait for evaluation time
                        udelay(dc->debug.ips2_eval_delay_us);
                        commit_state = dc->hwss.get_idle_state(dc);
-                       if (commit_state & DMUB_IPS2_COMMIT_MASK) {
+                       if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) {
                                // Tell PMFW to exit low power state
                                dc->clk_mgr->funcs->exit_low_power_state(dc->clk_mgr);
 
@@ -1216,7 +1216,7 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc)
 
                                for (i = 0; i < max_num_polls; ++i) {
                                        commit_state = dc->hwss.get_idle_state(dc);
-                                       if (!(commit_state & DMUB_IPS2_COMMIT_MASK))
+                                       if (commit_state & DMUB_IPS2_COMMIT_MASK)
                                                break;
 
                                        udelay(1);
@@ -1235,10 +1235,10 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc)
                }
 
                dc_dmub_srv_notify_idle(dc, false);
-               if (allow_state & DMUB_IPS1_ALLOW_MASK) {
+               if (!(allow_state & DMUB_IPS1_ALLOW_MASK)) {
                        for (i = 0; i < max_num_polls; ++i) {
                                commit_state = dc->hwss.get_idle_state(dc);
-                               if (!(commit_state & DMUB_IPS1_COMMIT_MASK))
+                               if (commit_state & DMUB_IPS1_COMMIT_MASK)
                                        break;
 
                                udelay(1);
diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h
index cea666e..fcb825e 100644
@@ -177,6 +177,7 @@ struct dc_panel_patch {
        unsigned int disable_fams;
        unsigned int skip_avmute;
        unsigned int mst_start_top_delay;
+       unsigned int remove_sink_ext_caps;
 };
 
 struct dc_edid_caps {
diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c
index 001f9eb..62a8f0b 100644
@@ -261,12 +261,6 @@ static void enc35_stream_encoder_enable(
                        /* invalid mode ! */
                        ASSERT_CRITICAL(false);
                }
-
-               REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1);
-               REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1);
-       } else {
-               REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0);
-               REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0);
        }
 }
 
@@ -436,6 +430,8 @@ static void enc35_disable_fifo(struct stream_encoder *enc)
        struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc);
 
        REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_ENABLE, 0);
+       REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0);
+       REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0);
 }
 
 static void enc35_enable_fifo(struct stream_encoder *enc)
@@ -443,6 +439,8 @@ static void enc35_enable_fifo(struct stream_encoder *enc)
        struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc);
 
        REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7);
+       REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1);
+       REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1);
 
        enc35_reset_fifo(enc, true);
        enc35_reset_fifo(enc, false);
diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c
index d6f0f85..f2fe523 100644
@@ -1088,6 +1088,9 @@ static bool detect_link_and_local_sink(struct dc_link *link,
                if (sink->edid_caps.panel_patch.skip_scdc_overwrite)
                        link->ctx->dc->debug.hdmi20_disable = true;
 
+               if (sink->edid_caps.panel_patch.remove_sink_ext_caps)
+                       link->dpcd_sink_ext_caps.raw = 0;
+
                if (dc_is_hdmi_signal(link->connector_signal))
                        read_scdc_caps(link->ddc, link->local_sink);
 
diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
index 9665ada..df63aa8 100644
@@ -195,6 +195,7 @@ struct dmub_srv_region_params {
        uint32_t vbios_size;
        const uint8_t *fw_inst_const;
        const uint8_t *fw_bss_data;
+       bool is_mailbox_in_inbox;
 };
 
 /**
@@ -214,20 +215,25 @@ struct dmub_srv_region_params {
  */
 struct dmub_srv_region_info {
        uint32_t fb_size;
+       uint32_t inbox_size;
        uint8_t num_regions;
        struct dmub_region regions[DMUB_WINDOW_TOTAL];
 };
 
 /**
- * struct dmub_srv_fb_params - parameters used for driver fb setup
+ * struct dmub_srv_memory_params - parameters used for driver fb setup
  * @region_info: region info calculated by dmub service
- * @cpu_addr: base cpu address for the framebuffer
- * @gpu_addr: base gpu virtual address for the framebuffer
+ * @cpu_fb_addr: base cpu address for the framebuffer
+ * @cpu_inbox_addr: base cpu address for the gart
+ * @gpu_fb_addr: base gpu virtual address for the framebuffer
+ * @gpu_inbox_addr: base gpu virtual address for the gart
  */
-struct dmub_srv_fb_params {
+struct dmub_srv_memory_params {
        const struct dmub_srv_region_info *region_info;
-       void *cpu_addr;
-       uint64_t gpu_addr;
+       void *cpu_fb_addr;
+       void *cpu_inbox_addr;
+       uint64_t gpu_fb_addr;
+       uint64_t gpu_inbox_addr;
 };
 
 /**
@@ -563,8 +569,8 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub,
  *   DMUB_STATUS_OK - success
  *   DMUB_STATUS_INVALID - unspecified error
  */
-enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub,
-                                      const struct dmub_srv_fb_params *params,
+enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub,
+                                      const struct dmub_srv_memory_params *params,
                                       struct dmub_srv_fb_info *out);
 
 /**
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c
index e43e8d4..22fc4ba 100644
@@ -434,7 +434,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub,
        uint32_t fw_state_size = DMUB_FW_STATE_SIZE;
        uint32_t trace_buffer_size = DMUB_TRACE_BUFFER_SIZE;
        uint32_t scratch_mem_size = DMUB_SCRATCH_MEM_SIZE;
-
+       uint32_t previous_top = 0;
        if (!dmub->sw_init)
                return DMUB_STATUS_INVALID;
 
@@ -459,8 +459,15 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub,
        bios->base = dmub_align(stack->top, 256);
        bios->top = bios->base + params->vbios_size;
 
-       mail->base = dmub_align(bios->top, 256);
-       mail->top = mail->base + DMUB_MAILBOX_SIZE;
+       if (params->is_mailbox_in_inbox) {
+               mail->base = 0;
+               mail->top = mail->base + DMUB_MAILBOX_SIZE;
+               previous_top = bios->top;
+       } else {
+               mail->base = dmub_align(bios->top, 256);
+               mail->top = mail->base + DMUB_MAILBOX_SIZE;
+               previous_top = mail->top;
+       }
 
        fw_info = dmub_get_fw_meta_info(params);
 
@@ -479,7 +486,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub,
                        dmub->fw_version = fw_info->fw_version;
        }
 
-       trace_buff->base = dmub_align(mail->top, 256);
+       trace_buff->base = dmub_align(previous_top, 256);
        trace_buff->top = trace_buff->base + dmub_align(trace_buffer_size, 64);
 
        fw_state->base = dmub_align(trace_buff->top, 256);
@@ -490,11 +497,14 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub,
 
        out->fb_size = dmub_align(scratch_mem->top, 4096);
 
+       if (params->is_mailbox_in_inbox)
+               out->inbox_size = dmub_align(mail->top, 4096);
+
        return DMUB_STATUS_OK;
 }
 
-enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub,
-                                      const struct dmub_srv_fb_params *params,
+enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub,
+                                      const struct dmub_srv_memory_params *params,
                                       struct dmub_srv_fb_info *out)
 {
        uint8_t *cpu_base;
@@ -509,8 +519,8 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub,
        if (params->region_info->num_regions != DMUB_NUM_WINDOWS)
                return DMUB_STATUS_INVALID;
 
-       cpu_base = (uint8_t *)params->cpu_addr;
-       gpu_base = params->gpu_addr;
+       cpu_base = (uint8_t *)params->cpu_fb_addr;
+       gpu_base = params->gpu_fb_addr;
 
        for (i = 0; i < DMUB_NUM_WINDOWS; ++i) {
                const struct dmub_region *reg =
@@ -518,6 +528,12 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub,
 
                out->fb[i].cpu_addr = cpu_base + reg->base;
                out->fb[i].gpu_addr = gpu_base + reg->base;
+
+               if (i == DMUB_WINDOW_4_MAILBOX && params->cpu_inbox_addr != 0) {
+                       out->fb[i].cpu_addr = (uint8_t *)params->cpu_inbox_addr + reg->base;
+                       out->fb[i].gpu_addr = params->gpu_inbox_addr + reg->base;
+               }
+
                out->fb[i].size = reg->top - reg->base;
        }
 
@@ -707,9 +723,16 @@ enum dmub_status dmub_srv_sync_inbox1(struct dmub_srv *dmub)
                return DMUB_STATUS_INVALID;
 
        if (dmub->hw_funcs.get_inbox1_rptr && dmub->hw_funcs.get_inbox1_wptr) {
-               dmub->inbox1_rb.rptr = dmub->hw_funcs.get_inbox1_rptr(dmub);
-               dmub->inbox1_rb.wrpt = dmub->hw_funcs.get_inbox1_wptr(dmub);
-               dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt;
+               uint32_t rptr = dmub->hw_funcs.get_inbox1_rptr(dmub);
+               uint32_t wptr = dmub->hw_funcs.get_inbox1_wptr(dmub);
+
+               if (rptr > dmub->inbox1_rb.capacity || wptr > dmub->inbox1_rb.capacity) {
+                       return DMUB_STATUS_HW_FAILURE;
+               } else {
+                       dmub->inbox1_rb.rptr = rptr;
+                       dmub->inbox1_rb.wrpt = wptr;
+                       dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt;
+               }
        }
 
        return DMUB_STATUS_OK;
@@ -743,6 +766,11 @@ enum dmub_status dmub_srv_cmd_queue(struct dmub_srv *dmub,
        if (!dmub->hw_init)
                return DMUB_STATUS_INVALID;
 
+       if (dmub->inbox1_rb.rptr > dmub->inbox1_rb.capacity ||
+           dmub->inbox1_rb.wrpt > dmub->inbox1_rb.capacity) {
+               return DMUB_STATUS_HW_FAILURE;
+       }
+
        if (dmub_rb_push_front(&dmub->inbox1_rb, cmd))
                return DMUB_STATUS_OK;
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
index dab35d8..fef2d29 100644
@@ -123,7 +123,7 @@ typedef enum {
   VOLTAGE_GUARDBAND_COUNT
 } GFX_GUARDBAND_e;
 
-#define SMU_METRICS_TABLE_VERSION 0x8
+#define SMU_METRICS_TABLE_VERSION 0x9
 
 typedef struct __attribute__((packed, aligned(4))) {
   uint32_t AccumulationCounter;
@@ -211,6 +211,14 @@ typedef struct __attribute__((packed, aligned(4))) {
  //XGMI Data transfer size
   uint64_t XgmiReadDataSizeAcc[8];//in KByte
   uint64_t XgmiWriteDataSizeAcc[8];//in KByte
+
+  //PCIE BW Data and error count
+  uint32_t PcieBandwidth[4];
+  uint32_t PCIeL0ToRecoveryCountAcc;      // The Pcie counter itself is accumulated
+  uint32_t PCIenReplayAAcc;               // The Pcie counter itself is accumulated
+  uint32_t PCIenReplayARolloverCountAcc;  // The Pcie counter itself is accumulated
+  uint32_t PCIeNAKSentCountAcc;           // The Pcie counter itself is accumulated
+  uint32_t PCIeNAKReceivedCountAcc;       // The Pcie counter itself is accumulated
 } MetricsTable_t;
 
 #define SMU_VF_METRICS_TABLE_VERSION 0x3
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 891605d..0e5a77c 100644
@@ -1454,7 +1454,7 @@ static int smu_v13_0_6_register_irq_handler(struct smu_context *smu)
 
 static int smu_v13_0_6_notify_unload(struct smu_context *smu)
 {
-       if (smu->smc_fw_version <= 0x553500)
+       if (amdgpu_in_reset(smu->adev))
                return 0;
 
        dev_dbg(smu->adev->dev, "Notify PMFW about driver unload");
@@ -2095,6 +2095,14 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table
                        smu_v13_0_6_get_current_pcie_link_speed(smu);
                gpu_metrics->pcie_bandwidth_acc =
                                SMUQ10_ROUND(metrics->PcieBandwidthAcc[0]);
+               gpu_metrics->pcie_bandwidth_inst =
+                               SMUQ10_ROUND(metrics->PcieBandwidth[0]);
+               gpu_metrics->pcie_l0_to_recov_count_acc =
+                               metrics->PCIeL0ToRecoveryCountAcc;
+               gpu_metrics->pcie_replay_count_acc =
+                               metrics->PCIenReplayAAcc;
+               gpu_metrics->pcie_replay_rover_count_acc =
+                               metrics->PCIenReplayARolloverCountAcc;
        }
 
        gpu_metrics->system_clock_counter = ktime_get_boottime_ns();
diff --git a/drivers/gpu/drm/ci/xfails/requirements.txt b/drivers/gpu/drm/ci/xfails/requirements.txt
index d8856d1..e9994c9 100644
@@ -5,7 +5,7 @@ termcolor==2.3.0
 certifi==2023.7.22
 charset-normalizer==3.2.0
 idna==3.4
-pip==23.2.1
+pip==23.3
 python-gitlab==3.15.0
 requests==2.31.0
 requests-toolbelt==1.0.0
@@ -13,5 +13,5 @@ ruamel.yaml==0.17.32
 ruamel.yaml.clib==0.2.7
 setuptools==68.0.0
 tenacity==8.2.3
-urllib3==2.0.4
-wheel==0.41.1
\ No newline at end of file
+urllib3==2.0.7
+wheel==0.41.1
diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c
index d5c1529..3d92f66 100644
@@ -336,6 +336,12 @@ static const struct dmi_system_id orientation_data[] = {
                  DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "IdeaPad Duet 3 10IGL5"),
                },
                .driver_data = (void *)&lcd1200x1920_rightside_up,
+       }, {    /* Lenovo Legion Go 8APU1 */
+               .matches = {
+                 DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+                 DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Legion Go 8APU1"),
+               },
+               .driver_data = (void *)&lcd1600x2560_leftside_up,
        }, {    /* Lenovo Yoga Book X90F / X90L */
                .matches = {
                  DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"),
diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h
index 82b267c..460459a 100644
@@ -14,7 +14,7 @@ struct nvkm_event {
        int index_nr;
 
        spinlock_t refs_lock;
-       spinlock_t list_lock;
+       rwlock_t list_lock;
        int *refs;
 
        struct list_head ntfy;
@@ -38,7 +38,7 @@ nvkm_event_init(const struct nvkm_event_func *func, struct nvkm_subdev *subdev,
                int types_nr, int index_nr, struct nvkm_event *event)
 {
        spin_lock_init(&event->refs_lock);
-       spin_lock_init(&event->list_lock);
+       rwlock_init(&event->list_lock);
        return __nvkm_event_init(func, subdev, types_nr, index_nr, event);
 }
 
index d8c9252..f28f9a8 100644
@@ -726,6 +726,11 @@ nouveau_display_create(struct drm_device *dev)
 
        if (nouveau_modeset != 2) {
                ret = nvif_disp_ctor(&drm->client.device, "kmsDisp", 0, &disp->disp);
+               /* no display hw */
+               if (ret == -ENODEV) {
+                       ret = 0;
+                       goto disp_create_err;
+               }
 
                if (!ret && (disp->disp.outp_mask || drm->vbios.dcb.entries)) {
                        nouveau_display_create_properties(dev);
index a6c8771..61fed77 100644
@@ -81,17 +81,17 @@ nvkm_event_ntfy_state(struct nvkm_event_ntfy *ntfy)
 static void
 nvkm_event_ntfy_remove(struct nvkm_event_ntfy *ntfy)
 {
-       spin_lock_irq(&ntfy->event->list_lock);
+       write_lock_irq(&ntfy->event->list_lock);
        list_del_init(&ntfy->head);
-       spin_unlock_irq(&ntfy->event->list_lock);
+       write_unlock_irq(&ntfy->event->list_lock);
 }
 
 static void
 nvkm_event_ntfy_insert(struct nvkm_event_ntfy *ntfy)
 {
-       spin_lock_irq(&ntfy->event->list_lock);
+       write_lock_irq(&ntfy->event->list_lock);
        list_add_tail(&ntfy->head, &ntfy->event->ntfy);
-       spin_unlock_irq(&ntfy->event->list_lock);
+       write_unlock_irq(&ntfy->event->list_lock);
 }
 
 static void
@@ -176,7 +176,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits)
                return;
 
        nvkm_trace(event->subdev, "event: ntfy %08x on %d\n", bits, id);
-       spin_lock_irqsave(&event->list_lock, flags);
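+       /* Delivery only reads the list; a read lock lets events fire concurrently. */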
+       read_lock_irqsave(&event->list_lock, flags);
 
        list_for_each_entry_safe(ntfy, ntmp, &event->ntfy, head) {
                if (ntfy->id == id && ntfy->bits & bits) {
@@ -185,7 +185,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits)
                }
        }
 
-       spin_unlock_irqrestore(&event->list_lock, flags);
+       read_unlock_irqrestore(&event->list_lock, flags);
 }
 
 void
index e31f964..dc44f5c 100644
@@ -689,8 +689,8 @@ r535_gsp_rpc_get(struct nvkm_gsp *gsp, u32 fn, u32 argc)
        struct nvfw_gsp_rpc *rpc;
 
        rpc = r535_gsp_cmdq_get(gsp, ALIGN(sizeof(*rpc) + argc, sizeof(u64)));
-       if (!rpc)
-               return NULL;
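+       /* The cmdq getter returns an ERR_PTR() on failure; propagate it as-is. */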
+       if (IS_ERR(rpc))
+               return ERR_CAST(rpc);
 
        rpc->header_version = 0x03000000;
        rpc->signature = ('C' << 24) | ('P' << 16) | ('R' << 8) | 'V';
@@ -1159,7 +1159,7 @@ static void
 r535_gsp_acpi_mux_id(acpi_handle handle, u32 id, MUX_METHOD_DATA_ELEMENT *mode,
                                                 MUX_METHOD_DATA_ELEMENT *part)
 {
-       acpi_handle iter = NULL, handle_mux;
+       acpi_handle iter = NULL, handle_mux = NULL;
        acpi_status status;
        unsigned long long value;
 
index affcfb2..35f7628 100644
@@ -63,7 +63,7 @@ static int dw_reg_read(void *context, unsigned int reg, unsigned int *val)
 {
        struct dw_i2c_dev *dev = context;
 
-       *val = readl_relaxed(dev->base + reg);
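+       /* Ordered MMIO accessors keep register I/O ordered against other
+        * memory accesses; the relaxed variants give no such guarantee.
+        */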
+       *val = readl(dev->base + reg);
 
        return 0;
 }
@@ -72,7 +72,7 @@ static int dw_reg_write(void *context, unsigned int reg, unsigned int val)
 {
        struct dw_i2c_dev *dev = context;
 
-       writel_relaxed(val, dev->base + reg);
+       writel(val, dev->base + reg);
 
        return 0;
 }
@@ -81,7 +81,7 @@ static int dw_reg_read_swab(void *context, unsigned int reg, unsigned int *val)
 {
        struct dw_i2c_dev *dev = context;
 
-       *val = swab32(readl_relaxed(dev->base + reg));
+       *val = swab32(readl(dev->base + reg));
 
        return 0;
 }
@@ -90,7 +90,7 @@ static int dw_reg_write_swab(void *context, unsigned int reg, unsigned int val)
 {
        struct dw_i2c_dev *dev = context;
 
-       writel_relaxed(swab32(val), dev->base + reg);
+       writel(swab32(val), dev->base + reg);
 
        return 0;
 }
@@ -99,8 +99,8 @@ static int dw_reg_read_word(void *context, unsigned int reg, unsigned int *val)
 {
        struct dw_i2c_dev *dev = context;
 
-       *val = readw_relaxed(dev->base + reg) |
-               (readw_relaxed(dev->base + reg + 2) << 16);
+       *val = readw(dev->base + reg) |
+               (readw(dev->base + reg + 2) << 16);
 
        return 0;
 }
@@ -109,8 +109,8 @@ static int dw_reg_write_word(void *context, unsigned int reg, unsigned int val)
 {
        struct dw_i2c_dev *dev = context;
 
-       writew_relaxed(val, dev->base + reg);
-       writew_relaxed(val >> 16, dev->base + reg + 2);
+       writew(val, dev->base + reg);
+       writew(val >> 16, dev->base + reg + 2);
 
        return 0;
 }
index 041a76f..e106af8 100644
@@ -771,8 +771,8 @@ static int ocores_i2c_resume(struct device *dev)
        return ocores_init(dev, i2c);
 }
 
-static DEFINE_SIMPLE_DEV_PM_OPS(ocores_i2c_pm,
-                               ocores_i2c_suspend, ocores_i2c_resume);
+static DEFINE_NOIRQ_DEV_PM_OPS(ocores_i2c_pm,
+                              ocores_i2c_suspend, ocores_i2c_resume);
 
 static struct platform_driver ocores_i2c_driver = {
        .probe   = ocores_i2c_probe,
index 1d76482..76f79b6 100644
@@ -265,6 +265,9 @@ struct pxa_i2c {
        u32                     hs_mask;
 
        struct i2c_bus_recovery_info recovery;
+       struct pinctrl          *pinctrl;
+       struct pinctrl_state    *pinctrl_default;
+       struct pinctrl_state    *pinctrl_recovery;
 };
 
 #define _IBMR(i2c)     ((i2c)->reg_ibmr)
@@ -1299,12 +1302,13 @@ static void i2c_pxa_prepare_recovery(struct i2c_adapter *adap)
         */
        gpiod_set_value(i2c->recovery.scl_gpiod, ibmr & IBMR_SCLS);
        gpiod_set_value(i2c->recovery.sda_gpiod, ibmr & IBMR_SDAS);
+
+       WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery));
 }
 
 static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap)
 {
        struct pxa_i2c *i2c = adap->algo_data;
-       struct i2c_bus_recovery_info *bri = adap->bus_recovery_info;
        u32 isr;
 
        /*
@@ -1318,7 +1322,7 @@ static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap)
                i2c_pxa_do_reset(i2c);
        }
 
-       WARN_ON(pinctrl_select_state(bri->pinctrl, bri->pins_default));
+       WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default));
 
        dev_dbg(&i2c->adap.dev, "recovery: IBMR 0x%08x ISR 0x%08x\n",
                readl(_IBMR(i2c)), readl(_ISR(i2c)));
@@ -1340,20 +1344,76 @@ static int i2c_pxa_init_recovery(struct pxa_i2c *i2c)
        if (IS_ENABLED(CONFIG_I2C_PXA_SLAVE))
                return 0;
 
-       bri->pinctrl = devm_pinctrl_get(dev);
-       if (PTR_ERR(bri->pinctrl) == -ENODEV) {
-               bri->pinctrl = NULL;
+       i2c->pinctrl = devm_pinctrl_get(dev);
+       if (PTR_ERR(i2c->pinctrl) == -ENODEV)
+               i2c->pinctrl = NULL;
+       if (IS_ERR(i2c->pinctrl))
+               return PTR_ERR(i2c->pinctrl);
+
+       if (!i2c->pinctrl)
+               return 0;
+
+       i2c->pinctrl_default = pinctrl_lookup_state(i2c->pinctrl,
+                                                   PINCTRL_STATE_DEFAULT);
+       i2c->pinctrl_recovery = pinctrl_lookup_state(i2c->pinctrl, "recovery");
+
+       if (IS_ERR(i2c->pinctrl_default) || IS_ERR(i2c->pinctrl_recovery)) {
+               dev_info(dev, "missing pinmux recovery information: %ld %ld\n",
+                        PTR_ERR(i2c->pinctrl_default),
+                        PTR_ERR(i2c->pinctrl_recovery));
+               return 0;
+       }
+
+       /*
+        * Claiming GPIOs can influence the pinmux state, and may glitch the
+        * I2C bus. Do this carefully.
+        */
+       bri->scl_gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN);
+       if (bri->scl_gpiod == ERR_PTR(-EPROBE_DEFER))
+               return -EPROBE_DEFER;
+       if (IS_ERR(bri->scl_gpiod)) {
+               dev_info(dev, "missing scl gpio recovery information: %pe\n",
+                        bri->scl_gpiod);
+               return 0;
+       }
+
+       /*
+        * We have SCL. Pull SCL low and wait a bit so that SDA glitches
+        * have no effect.
+        */
+       gpiod_direction_output(bri->scl_gpiod, 0);
+       udelay(10);
+       bri->sda_gpiod = devm_gpiod_get(dev, "sda", GPIOD_OUT_HIGH_OPEN_DRAIN);
+
+       /* Wait a bit in case of an SDA glitch, and then release SCL. */
+       udelay(10);
+       gpiod_direction_output(bri->scl_gpiod, 1);
+
+       if (bri->sda_gpiod == ERR_PTR(-EPROBE_DEFER))
+               return -EPROBE_DEFER;
+
+       if (IS_ERR(bri->sda_gpiod)) {
+               dev_info(dev, "missing sda gpio recovery information: %pe\n",
+                        bri->sda_gpiod);
                return 0;
        }
-       if (IS_ERR(bri->pinctrl))
-               return PTR_ERR(bri->pinctrl);
 
        bri->prepare_recovery = i2c_pxa_prepare_recovery;
        bri->unprepare_recovery = i2c_pxa_unprepare_recovery;
+       bri->recover_bus = i2c_generic_scl_recovery;
 
        i2c->adap.bus_recovery_info = bri;
 
-       return 0;
+       /*
+        * Claiming GPIOs can change the pinmux state, which confuses pinctrl
+        * because its idea of the current setting is not updated by the
+        * pinmux change that claiming the GPIO causes. Work around that by
+        * switching pinctrl to the GPIO state here, done in this order to
+        * avoid glitching the I2C bus.
+        */
+       pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery);
+
+       return pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default);
 }
 
 static int i2c_pxa_probe(struct platform_device *dev)
index a8c89df..9a7a742 100644
@@ -2379,12 +2379,12 @@ retry_baser:
                break;
        }
 
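+       /* A non-shareable table must be cleaned to the PoC before the ITS reads it. */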
+       if (!shr)
+               gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
+
        its_write_baser(its, baser, val);
        tmp = baser->val;
 
-       if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE)
-               tmp &= ~GITS_BASER_SHAREABILITY_MASK;
-
        if ((val ^ tmp) & GITS_BASER_SHAREABILITY_MASK) {
                /*
                 * Shareability didn't stick. Just use
@@ -2394,10 +2394,9 @@ retry_baser:
                 * non-cacheable as well.
                 */
                shr = tmp & GITS_BASER_SHAREABILITY_MASK;
-               if (!shr) {
+               if (!shr)
                        cache = GITS_BASER_nC;
-                       gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order));
-               }
+
                goto retry_baser;
        }
 
@@ -2609,6 +2608,11 @@ static int its_alloc_tables(struct its_node *its)
                /* erratum 24313: ignore memory access type */
                cache = GITS_BASER_nCnB;
 
+       if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) {
+               cache = GITS_BASER_nC;
+               shr = 0;
+       }
+
        for (i = 0; i < GITS_BASER_NR_REGS; i++) {
                struct its_baser *baser = its->tables + i;
                u64 val = its_read_baser(its, baser);
index 62eb276..f03d7db 100644
@@ -254,7 +254,7 @@ enum evict_result {
 
 typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
 
-static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context)
+static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep)
 {
        unsigned long tested = 0;
        struct list_head *h = lru->cursor;
@@ -295,7 +295,8 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con
 
                h = h->next;
 
-               cond_resched();
+               if (!no_sleep)
+                       cond_resched();
        }
 
        return NULL;
@@ -382,7 +383,10 @@ struct dm_buffer {
  */
 
 struct buffer_tree {
-       struct rw_semaphore lock;
+       union {
+               struct rw_semaphore lock;
+               rwlock_t spinlock;
+       } u;
        struct rb_root root;
 } ____cacheline_aligned_in_smp;
 
@@ -393,9 +397,12 @@ struct dm_buffer_cache {
         * on the locks.
         */
        unsigned int num_locks;
+       bool no_sleep;
        struct buffer_tree trees[];
 };
 
+static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
+
 static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
 {
        return dm_hash_locks_index(block, num_locks);
@@ -403,22 +410,34 @@ static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
 
 static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
 {
-       down_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
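+       /* No-sleep clients use a BH-safe rwlock instead of the rwsem; the
+        * static key keeps the sleepable fast path cheap.
+        */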
+       if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
+               read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
+       else
+               down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
 }
 
 static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
 {
-       up_read(&bc->trees[cache_index(block, bc->num_locks)].lock);
+       if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
+               read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
+       else
+               up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
 }
 
 static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
 {
-       down_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
+       if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
+               write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
+       else
+               down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
 }
 
 static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
 {
-       up_write(&bc->trees[cache_index(block, bc->num_locks)].lock);
+       if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep)
+               write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock);
+       else
+               up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock);
 }
 
 /*
@@ -442,18 +461,32 @@ static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool
 
 static void __lh_lock(struct lock_history *lh, unsigned int index)
 {
-       if (lh->write)
-               down_write(&lh->cache->trees[index].lock);
-       else
-               down_read(&lh->cache->trees[index].lock);
+       if (lh->write) {
+               if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
+                       write_lock_bh(&lh->cache->trees[index].u.spinlock);
+               else
+                       down_write(&lh->cache->trees[index].u.lock);
+       } else {
+               if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
+                       read_lock_bh(&lh->cache->trees[index].u.spinlock);
+               else
+                       down_read(&lh->cache->trees[index].u.lock);
+       }
 }
 
 static void __lh_unlock(struct lock_history *lh, unsigned int index)
 {
-       if (lh->write)
-               up_write(&lh->cache->trees[index].lock);
-       else
-               up_read(&lh->cache->trees[index].lock);
+       if (lh->write) {
+               if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
+                       write_unlock_bh(&lh->cache->trees[index].u.spinlock);
+               else
+                       up_write(&lh->cache->trees[index].u.lock);
+       } else {
+               if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep)
+                       read_unlock_bh(&lh->cache->trees[index].u.spinlock);
+               else
+                       up_read(&lh->cache->trees[index].u.lock);
+       }
 }
 
 /*
@@ -502,14 +535,18 @@ static struct dm_buffer *list_to_buffer(struct list_head *l)
        return le_to_buffer(le);
 }
 
-static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks)
+static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep)
 {
        unsigned int i;
 
        bc->num_locks = num_locks;
+       bc->no_sleep = no_sleep;
 
        for (i = 0; i < bc->num_locks; i++) {
-               init_rwsem(&bc->trees[i].lock);
+               if (no_sleep)
+                       rwlock_init(&bc->trees[i].u.spinlock);
+               else
+                       init_rwsem(&bc->trees[i].u.lock);
                bc->trees[i].root = RB_ROOT;
        }
 
@@ -648,7 +685,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode
        struct lru_entry *le;
        struct dm_buffer *b;
 
-       le = lru_evict(&bc->lru[list_mode], __evict_pred, &w);
+       le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep);
        if (!le)
                return NULL;
 
@@ -702,7 +739,7 @@ static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_
        struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
 
        while (true) {
-               le = lru_evict(&bc->lru[old_mode], __evict_pred, &w);
+               le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep);
                if (!le)
                        break;
 
@@ -915,10 +952,11 @@ static void cache_remove_range(struct dm_buffer_cache *bc,
 {
        unsigned int i;
 
+       BUG_ON(bc->no_sleep);
        for (i = 0; i < bc->num_locks; i++) {
-               down_write(&bc->trees[i].lock);
+               down_write(&bc->trees[i].u.lock);
                __remove_range(bc, &bc->trees[i].root, begin, end, pred, release);
-               up_write(&bc->trees[i].lock);
+               up_write(&bc->trees[i].u.lock);
        }
 }
 
@@ -979,8 +1017,6 @@ struct dm_bufio_client {
        struct dm_buffer_cache cache; /* must be last member */
 };
 
-static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);
-
 /*----------------------------------------------------------------*/
 
 #define dm_bufio_in_request()  (!!current->bio_list)
@@ -1871,7 +1907,8 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
        if (need_submit)
                submit_io(b, REQ_OP_READ, read_endio);
 
-       wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
+       if (nf != NF_GET)       /* we already tested this condition above */
+               wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 
        if (b->read_error) {
                int error = blk_status_to_errno(b->read_error);
@@ -2421,7 +2458,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
                r = -ENOMEM;
                goto bad_client;
        }
-       cache_init(&c->cache, num_locks);
+       cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0);
 
        c->bdev = bdev;
        c->block_size = block_size;
index 6de107a..2ae8560 100644
@@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
        unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
        unsigned int remaining_size;
-       unsigned int order = MAX_ORDER - 1;
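+       /* MAX_ORDER is now itself a valid allocation order, so drop the "- 1". */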
+       unsigned int order = MAX_ORDER;
 
 retry:
        if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
index efd5109..5eabdb0 100644
@@ -33,7 +33,7 @@ struct delay_c {
        struct work_struct flush_expired_bios;
        struct list_head delayed_bios;
        struct task_struct *worker;
-       atomic_t may_delay;
+       bool may_delay;
 
        struct delay_class read;
        struct delay_class write;
@@ -73,39 +73,6 @@ static inline bool delay_is_fast(struct delay_c *dc)
        return !!dc->worker;
 }
 
-static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all)
-{
-       struct dm_delay_info *delayed, *next;
-
-       mutex_lock(&delayed_bios_lock);
-       list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
-               if (flush_all || time_after_eq(jiffies, delayed->expires)) {
-                       struct bio *bio = dm_bio_from_per_bio_data(delayed,
-                                               sizeof(struct dm_delay_info));
-                       list_del(&delayed->list);
-                       dm_submit_bio_remap(bio, NULL);
-                       delayed->class->ops--;
-               }
-       }
-       mutex_unlock(&delayed_bios_lock);
-}
-
-static int flush_worker_fn(void *data)
-{
-       struct delay_c *dc = data;
-
-       while (1) {
-               flush_delayed_bios_fast(dc, false);
-               if (unlikely(list_empty(&dc->delayed_bios))) {
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       schedule();
-               } else
-                       cond_resched();
-       }
-
-       return 0;
-}
-
 static void flush_bios(struct bio *bio)
 {
        struct bio *n;
@@ -118,36 +85,61 @@ static void flush_bios(struct bio *bio)
        }
 }
 
-static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all)
+static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
 {
        struct dm_delay_info *delayed, *next;
+       struct bio_list flush_bio_list;
        unsigned long next_expires = 0;
-       unsigned long start_timer = 0;
-       struct bio_list flush_bios = { };
+       bool start_timer = false;
+       bio_list_init(&flush_bio_list);
 
        mutex_lock(&delayed_bios_lock);
        list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+               cond_resched();
                if (flush_all || time_after_eq(jiffies, delayed->expires)) {
                        struct bio *bio = dm_bio_from_per_bio_data(delayed,
                                                sizeof(struct dm_delay_info));
                        list_del(&delayed->list);
-                       bio_list_add(&flush_bios, bio);
+                       bio_list_add(&flush_bio_list, bio);
                        delayed->class->ops--;
                        continue;
                }
 
-               if (!start_timer) {
-                       start_timer = 1;
-                       next_expires = delayed->expires;
-               } else
-                       next_expires = min(next_expires, delayed->expires);
+               if (!delay_is_fast(dc)) {
+                       if (!start_timer) {
+                               start_timer = true;
+                               next_expires = delayed->expires;
+                       } else {
+                               next_expires = min(next_expires, delayed->expires);
+                       }
+               }
        }
        mutex_unlock(&delayed_bios_lock);
 
        if (start_timer)
                queue_timeout(dc, next_expires);
 
-       return bio_list_get(&flush_bios);
+       flush_bios(bio_list_get(&flush_bio_list));
+}
+
+static int flush_worker_fn(void *data)
+{
+       struct delay_c *dc = data;
+
+       while (!kthread_should_stop()) {
+               flush_delayed_bios(dc, false);
+               mutex_lock(&delayed_bios_lock);
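+               /* Re-check under the lock so a newly queued bio is not missed
+                * between flushing and going to sleep.
+                */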
+               if (unlikely(list_empty(&dc->delayed_bios))) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       mutex_unlock(&delayed_bios_lock);
+                       schedule();
+               } else {
+                       mutex_unlock(&delayed_bios_lock);
+                       cond_resched();
+               }
+       }
+
+       return 0;
 }
 
 static void flush_expired_bios(struct work_struct *work)
@@ -155,10 +147,7 @@ static void flush_expired_bios(struct work_struct *work)
        struct delay_c *dc;
 
        dc = container_of(work, struct delay_c, flush_expired_bios);
-       if (delay_is_fast(dc))
-               flush_delayed_bios_fast(dc, false);
-       else
-               flush_bios(flush_delayed_bios(dc, false));
+       flush_delayed_bios(dc, false);
 }
 
 static void delay_dtr(struct dm_target *ti)
@@ -177,8 +166,7 @@ static void delay_dtr(struct dm_target *ti)
        if (dc->worker)
                kthread_stop(dc->worker);
 
-       if (!delay_is_fast(dc))
-               mutex_destroy(&dc->timer_lock);
+       mutex_destroy(&dc->timer_lock);
 
        kfree(dc);
 }
@@ -236,7 +224,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
        ti->private = dc;
        INIT_LIST_HEAD(&dc->delayed_bios);
-       atomic_set(&dc->may_delay, 1);
+       mutex_init(&dc->timer_lock);
+       dc->may_delay = true;
        dc->argc = argc;
 
        ret = delay_class_ctr(ti, &dc->read, argv);
@@ -282,12 +271,12 @@ out:
                                            "dm-delay-flush-worker");
                if (IS_ERR(dc->worker)) {
                        ret = PTR_ERR(dc->worker);
+                       dc->worker = NULL;
                        goto bad;
                }
        } else {
                timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
                INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
-               mutex_init(&dc->timer_lock);
                dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
                if (!dc->kdelayd_wq) {
                        ret = -EINVAL;
@@ -312,7 +301,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
        struct dm_delay_info *delayed;
        unsigned long expires = 0;
 
-       if (!c->delay || !atomic_read(&dc->may_delay))
+       if (!c->delay)
                return DM_MAPIO_REMAPPED;
 
        delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
@@ -321,6 +310,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
        delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
 
        mutex_lock(&delayed_bios_lock);
+       if (unlikely(!dc->may_delay)) {
+               mutex_unlock(&delayed_bios_lock);
+               return DM_MAPIO_REMAPPED;
+       }
        c->ops++;
        list_add_tail(&delayed->list, &dc->delayed_bios);
        mutex_unlock(&delayed_bios_lock);
@@ -337,21 +330,20 @@ static void delay_presuspend(struct dm_target *ti)
 {
        struct delay_c *dc = ti->private;
 
-       atomic_set(&dc->may_delay, 0);
+       mutex_lock(&delayed_bios_lock);
+       dc->may_delay = false;
+       mutex_unlock(&delayed_bios_lock);
 
-       if (delay_is_fast(dc))
-               flush_delayed_bios_fast(dc, true);
-       else {
+       if (!delay_is_fast(dc))
                del_timer_sync(&dc->delay_timer);
-               flush_bios(flush_delayed_bios(dc, true));
-       }
+       flush_delayed_bios(dc, true);
 }
 
 static void delay_resume(struct dm_target *ti)
 {
        struct delay_c *dc = ti->private;
 
-       atomic_set(&dc->may_delay, 1);
+       dc->may_delay = true;
 }
 
 static int delay_map(struct dm_target *ti, struct bio *bio)
index 3ef9f01..2099c75 100644
@@ -185,7 +185,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
 {
        if (unlikely(verity_hash(v, verity_io_hash_req(v, io),
                                 data, 1 << v->data_dev_block_bits,
-                                verity_io_real_digest(v, io))))
+                                verity_io_real_digest(v, io), true)))
                return 0;
 
        return memcmp(verity_io_real_digest(v, io), want_digest,
@@ -386,7 +386,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
        /* Always re-validate the corrected block against the expected hash */
        r = verity_hash(v, verity_io_hash_req(v, io), fio->output,
                        1 << v->data_dev_block_bits,
-                       verity_io_real_digest(v, io));
+                       verity_io_real_digest(v, io), true);
        if (unlikely(r < 0))
                return r;
 
index 26adcfe..e115fcf 100644
@@ -135,20 +135,21 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
  * Wrapper for crypto_ahash_init, which handles verity salting.
  */
 static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
-                               struct crypto_wait *wait)
+                               struct crypto_wait *wait, bool may_sleep)
 {
        int r;
 
        ahash_request_set_tfm(req, v->tfm);
-       ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
-                                       CRYPTO_TFM_REQ_MAY_BACKLOG,
-                                       crypto_req_done, (void *)wait);
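+       /* Atomic (tasklet) callers must not let the crypto layer sleep or
+        * backlog the request.
+        */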
+       ahash_request_set_callback(req,
+               may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0,
+               crypto_req_done, (void *)wait);
        crypto_init_wait(wait);
 
        r = crypto_wait_req(crypto_ahash_init(req), wait);
 
        if (unlikely(r < 0)) {
-               DMERR("crypto_ahash_init failed: %d", r);
+               if (r != -ENOMEM)
+                       DMERR("crypto_ahash_init failed: %d", r);
                return r;
        }
 
@@ -179,12 +180,12 @@ out:
 }
 
 int verity_hash(struct dm_verity *v, struct ahash_request *req,
-               const u8 *data, size_t len, u8 *digest)
+               const u8 *data, size_t len, u8 *digest, bool may_sleep)
 {
        int r;
        struct crypto_wait wait;
 
-       r = verity_hash_init(v, req, &wait);
+       r = verity_hash_init(v, req, &wait, may_sleep);
        if (unlikely(r < 0))
                goto out;
 
@@ -322,7 +323,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
 
                r = verity_hash(v, verity_io_hash_req(v, io),
                                data, 1 << v->hash_dev_block_bits,
-                               verity_io_real_digest(v, io));
+                               verity_io_real_digest(v, io), !io->in_tasklet);
                if (unlikely(r < 0))
                        goto release_ret_r;
 
@@ -556,7 +557,7 @@ static int verity_verify_io(struct dm_verity_io *io)
                        continue;
                }
 
-               r = verity_hash_init(v, req, &wait);
+               r = verity_hash_init(v, req, &wait, !io->in_tasklet);
                if (unlikely(r < 0))
                        return r;
 
@@ -652,7 +653,7 @@ static void verity_tasklet(unsigned long data)
 
        io->in_tasklet = true;
        err = verity_verify_io(io);
-       if (err == -EAGAIN) {
+       if (err == -EAGAIN || err == -ENOMEM) {
                /* fallback to retrying with work-queue */
                INIT_WORK(&io->work, verity_work);
                queue_work(io->v->verify_wq, &io->work);
@@ -1033,7 +1034,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
                goto out;
 
        r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits,
-                       v->zero_digest);
+                       v->zero_digest, true);
 
 out:
        kfree(req);
index 2f555b4..f96f4e2 100644
@@ -128,7 +128,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
                                              u8 *data, size_t len));
 
 extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
-                      const u8 *data, size_t len, u8 *digest);
+                      const u8 *data, size_t len, u8 *digest, bool may_sleep);
 
 extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
                                 sector_t block, u8 *digest, bool *is_zero);
index 51d47ed..8e6cc0e 100644
@@ -1500,6 +1500,10 @@ done:
 static void bond_setup_by_slave(struct net_device *bond_dev,
                                struct net_device *slave_dev)
 {
+       bool was_up = !!(bond_dev->flags & IFF_UP);
+
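+       /* dev->type and header_ops must not change on a running device, so
+        * close it for the update and reopen it afterwards.
+        */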
+       dev_close(bond_dev);
+
        bond_dev->header_ops        = slave_dev->header_ops;
 
        bond_dev->type              = slave_dev->type;
@@ -1514,6 +1518,8 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
                bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
                bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP);
        }
+       if (was_up)
+               dev_open(bond_dev, NULL);
 }
 
 /* On bonding slaves other than the currently active slave, suppress
index 045fe13..5beadab 100644
@@ -146,7 +146,7 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data)
        }
 
        queue_work(pdsc->wq, &qcq->work);
-       pds_core_intr_mask(&pdsc->intr_ctrl[irq], PDS_CORE_INTR_MASK_CLEAR);
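+       /* Index intr_ctrl by the queue's own vector, not the Linux IRQ number. */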
+       pds_core_intr_mask(&pdsc->intr_ctrl[qcq->intx], PDS_CORE_INTR_MASK_CLEAR);
 
        return IRQ_HANDLED;
 }
index f3a7ded..e35d3e7 100644
@@ -15,7 +15,7 @@
 #define PDSC_DRV_DESCRIPTION   "AMD/Pensando Core Driver"
 
 #define PDSC_WATCHDOG_SECS     5
-#define PDSC_QUEUE_NAME_MAX_SZ  32
+#define PDSC_QUEUE_NAME_MAX_SZ  16
 #define PDSC_ADMINQ_MIN_LENGTH 16      /* must be a power of two */
 #define PDSC_NOTIFYQ_LENGTH    64      /* must be a power of two */
 #define PDSC_TEARDOWN_RECOVERY false
index 7c1b965..31940b8 100644
@@ -261,10 +261,14 @@ static int pdsc_identify(struct pdsc *pdsc)
        struct pds_core_drv_identity drv = {};
        size_t sz;
        int err;
+       int n;
 
        drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX);
-       snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str),
-                "%s %s", PDS_CORE_DRV_NAME, utsname()->release);
+       /* Capturing the return value quiets a -Wformat-truncation warning */
+       n = snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str),
+                    "%s %s", PDS_CORE_DRV_NAME, utsname()->release);
+       if (n > sizeof(drv.driver_ver_str))
+               dev_dbg(pdsc->dev, "release name truncated, don't care\n");
 
        /* Next let's get some info about the device
         * We use the devcmd_lock at this level in order to
index 57f88c8..e9948ea 100644
@@ -104,7 +104,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
        struct pds_core_fw_list_info fw_list;
        struct pdsc *pdsc = devlink_priv(dl);
        union pds_core_dev_comp comp;
-       char buf[16];
+       char buf[32];
        int listlen;
        int err;
        int i;
index 1dee273..48b6191 100644
@@ -6889,7 +6889,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
                                       desc_idx, *post_ptr);
                drop_it_no_recycle:
                        /* Other statistics kept track of by card. */
-                       tp->rx_dropped++;
+                       tnapi->rx_dropped++;
                        goto next_pkt;
                }
 
@@ -7918,8 +7918,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi,
 
        segs = skb_gso_segment(skb, tp->dev->features &
                                    ~(NETIF_F_TSO | NETIF_F_TSO6));
-       if (IS_ERR(segs) || !segs)
+       if (IS_ERR(segs) || !segs) {
+               tnapi->tx_dropped++;
                goto tg3_tso_bug_end;
+       }
 
        skb_list_walk_safe(segs, seg, next) {
                skb_mark_not_on_list(seg);
@@ -8190,7 +8192,7 @@ dma_error:
 drop:
        dev_kfree_skb_any(skb);
 drop_nofree:
-       tp->tx_dropped++;
+       tnapi->tx_dropped++;
        return NETDEV_TX_OK;
 }
 
@@ -9405,7 +9407,7 @@ static void __tg3_set_rx_mode(struct net_device *);
 /* tp->lock is held. */
 static int tg3_halt(struct tg3 *tp, int kind, bool silent)
 {
-       int err;
+       int err, i;
 
        tg3_stop_fw(tp);
 
@@ -9426,6 +9428,13 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent)
 
                /* And make sure the next sample is new data */
                memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats));
+
+               for (i = 0; i < TG3_IRQ_MAX_VECS; ++i) {
+                       struct tg3_napi *tnapi = &tp->napi[i];
+
+                       tnapi->rx_dropped = 0;
+                       tnapi->tx_dropped = 0;
+               }
        }
 
        return err;
@@ -11975,6 +11984,9 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats)
 {
        struct rtnl_link_stats64 *old_stats = &tp->net_stats_prev;
        struct tg3_hw_stats *hw_stats = tp->hw_stats;
+       unsigned long rx_dropped;
+       unsigned long tx_dropped;
+       int i;
 
        stats->rx_packets = old_stats->rx_packets +
                get_stat64(&hw_stats->rx_ucast_packets) +
@@ -12021,8 +12033,26 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats)
        stats->rx_missed_errors = old_stats->rx_missed_errors +
                get_stat64(&hw_stats->rx_discards);
 
-       stats->rx_dropped = tp->rx_dropped;
-       stats->tx_dropped = tp->tx_dropped;
+       /* Aggregate per-queue counters. The per-queue counters are updated
+        * by a single writer, race-free. The result computed by this loop
+        * might not be 100% accurate (counters can be updated in the middle of
+        * the loop) but the next tg3_get_nstats() will recompute the current
+        * value so it is acceptable.
+        *
+        * Note that these counters wrap around at 4G on 32-bit machines.
+        */
+       rx_dropped = (unsigned long)(old_stats->rx_dropped);
+       tx_dropped = (unsigned long)(old_stats->tx_dropped);
+
+       for (i = 0; i < tp->irq_cnt; i++) {
+               struct tg3_napi *tnapi = &tp->napi[i];
+
+               rx_dropped += tnapi->rx_dropped;
+               tx_dropped += tnapi->tx_dropped;
+       }
+
+       stats->rx_dropped = rx_dropped;
+       stats->tx_dropped = tx_dropped;
 }
 
 static int tg3_get_regs_len(struct net_device *dev)
index ae5c01b..5016475 100644
@@ -3018,6 +3018,7 @@ struct tg3_napi {
        u16                             *rx_rcb_prod_idx;
        struct tg3_rx_prodring_set      prodring;
        struct tg3_rx_buffer_desc       *rx_rcb;
+       unsigned long                   rx_dropped;
 
        u32                             tx_prod ____cacheline_aligned;
        u32                             tx_cons;
@@ -3026,6 +3027,7 @@ struct tg3_napi {
        u32                             prodmbox;
        struct tg3_tx_buffer_desc       *tx_ring;
        struct tg3_tx_ring_info         *tx_buffers;
+       unsigned long                   tx_dropped;
 
        dma_addr_t                      status_mapping;
        dma_addr_t                      rx_rcb_mapping;
@@ -3220,8 +3222,6 @@ struct tg3 {
 
 
        /* begin "everything else" cacheline(s) section */
-       unsigned long                   rx_dropped;
-       unsigned long                   tx_dropped;
        struct rtnl_link_stats64        net_stats_prev;
        struct tg3_ethtool_stats        estats_prev;
 
index 5423fe2..78287cf 100644
@@ -432,8 +432,8 @@ static const struct gmac_max_framelen gmac_maxlens[] = {
                .val = CONFIG0_MAXLEN_1536,
        },
        {
-               .max_l3_len = 1542,
-               .val = CONFIG0_MAXLEN_1542,
+               .max_l3_len = 1548,
+               .val = CONFIG0_MAXLEN_1548,
        },
        {
                .max_l3_len = 9212,
@@ -1145,6 +1145,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb,
        dma_addr_t mapping;
        unsigned short mtu;
        void *buffer;
+       int ret;
 
        mtu  = ETH_HLEN;
        mtu += netdev->mtu;
@@ -1159,9 +1160,30 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb,
                word3 |= mtu;
        }
 
-       if (skb->ip_summed != CHECKSUM_NONE) {
+       if (skb->len >= ETH_FRAME_LEN) {
+               /* Hardware-offloaded checksumming does not work on frames
+                * bigger than 1514 bytes. One hypothesis is that the checksum
+                * buffer is only 1518 bytes, so larger frames get truncated,
+                * or the last few bytes get overwritten by the FCS.
+                *
+                * Just use software checksumming and bypass on bigger frames.
+                */
+               if (skb->ip_summed == CHECKSUM_PARTIAL) {
+                       ret = skb_checksum_help(skb);
+                       if (ret)
+                               return ret;
+               }
+               word1 |= TSS_BYPASS_BIT;
+       } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
                int tcp = 0;
 
+               /* We do not switch off checksumming on non-TCP/UDP frames:
+                * tests show that the checksumming engine is smart enough to
+                * see that a frame is not actually TCP or UDP and simply
+                * passes it through unchanged.
+                */
                if (skb->protocol == htons(ETH_P_IP)) {
                        word1 |= TSS_IP_CHKSUM_BIT;
                        tcp = ip_hdr(skb)->protocol == IPPROTO_TCP;
@@ -1978,15 +2000,6 @@ static int gmac_change_mtu(struct net_device *netdev, int new_mtu)
        return 0;
 }
 
-static netdev_features_t gmac_fix_features(struct net_device *netdev,
-                                          netdev_features_t features)
-{
-       if (netdev->mtu + ETH_HLEN + VLAN_HLEN > MTU_SIZE_BIT_MASK)
-               features &= ~GMAC_OFFLOAD_FEATURES;
-
-       return features;
-}
-
 static int gmac_set_features(struct net_device *netdev,
                             netdev_features_t features)
 {
@@ -2212,7 +2225,6 @@ static const struct net_device_ops gmac_351x_ops = {
        .ndo_set_mac_address    = gmac_set_mac_address,
        .ndo_get_stats64        = gmac_get_stats64,
        .ndo_change_mtu         = gmac_change_mtu,
-       .ndo_fix_features       = gmac_fix_features,
        .ndo_set_features       = gmac_set_features,
 };
 
@@ -2464,11 +2476,12 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev)
 
        netdev->hw_features = GMAC_OFFLOAD_FEATURES;
        netdev->features |= GMAC_OFFLOAD_FEATURES | NETIF_F_GRO;
-       /* We can handle jumbo frames up to 10236 bytes so, let's accept
-        * payloads of 10236 bytes minus VLAN and ethernet header
+       /* We can receive jumbo frames up to 10236 bytes but can only
+        * transmit 2047 bytes, so accept payloads of 2047 bytes minus
+        * the VLAN and Ethernet headers.
         */
        netdev->min_mtu = ETH_MIN_MTU;
-       netdev->max_mtu = 10236 - VLAN_ETH_HLEN;
+       netdev->max_mtu = MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN;
 
        port->freeq_refill = 0;
        netif_napi_add(netdev, &port->napi, gmac_napi_poll);
index 9fdf77d..24bb989 100644
@@ -502,7 +502,7 @@ union gmac_txdesc_3 {
 #define SOF_BIT                        0x80000000
 #define EOF_BIT                        0x40000000
 #define EOFIE_BIT              BIT(29)
-#define MTU_SIZE_BIT_MASK      0x1fff
+#define MTU_SIZE_BIT_MASK      0x7ff /* Max MTU 2047 bytes */
 
 /* GMAC Tx Descriptor */
 struct gmac_txdesc {
@@ -787,7 +787,7 @@ union gmac_config0 {
 #define  CONFIG0_MAXLEN_1536   0
 #define  CONFIG0_MAXLEN_1518   1
 #define  CONFIG0_MAXLEN_1522   2
-#define  CONFIG0_MAXLEN_1542   3
+#define  CONFIG0_MAXLEN_1548   3
 #define  CONFIG0_MAXLEN_9k     4       /* 9212 */
 #define  CONFIG0_MAXLEN_10k    5       /* 10236 */
 #define  CONFIG0_MAXLEN_1518__6        6
index 276f996..2d42e73 100644
@@ -254,10 +254,13 @@ static int gve_napi_poll(struct napi_struct *napi, int budget)
        if (block->tx) {
                if (block->tx->q_num < priv->tx_cfg.num_queues)
                        reschedule |= gve_tx_poll(block, budget);
-               else
+               else if (budget)
                        reschedule |= gve_xdp_poll(block, budget);
        }
 
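+       /* A budget of zero means Tx-only servicing (netpoll); skip all Rx work. */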
+       if (!budget)
+               return 0;
+
        if (block->rx) {
                work_done = gve_rx_poll(block, budget);
                reschedule |= work_done == budget;
@@ -298,6 +301,9 @@ static int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
        if (block->tx)
                reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true);
 
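+       /* A budget of zero means Tx-only servicing (netpoll); skip all Rx work. */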
+       if (!budget)
+               return 0;
+
        if (block->rx) {
                work_done = gve_rx_poll_dqo(block, budget);
                reschedule |= work_done == budget;
index e84a066..7365534 100644
@@ -1007,10 +1007,6 @@ int gve_rx_poll(struct gve_notify_block *block, int budget)
 
        feat = block->napi.dev->features;
 
-       /* If budget is 0, do all the work */
-       if (budget == 0)
-               budget = INT_MAX;
-
        if (budget > 0)
                work_done = gve_clean_rx_done(rx, budget, feat);
 
index 6957a86..9f6ffc4 100644
@@ -925,10 +925,6 @@ bool gve_xdp_poll(struct gve_notify_block *block, int budget)
        bool repoll;
        u32 to_do;
 
-       /* If budget is 0, do all the work */
-       if (budget == 0)
-               budget = INT_MAX;
-
        /* Find out how much work there is to be done */
        nic_done = gve_tx_load_event_counter(priv, tx);
        to_do = min_t(u32, (nic_done - tx->done), budget);
index 0b13863..c083d1d 100644
@@ -503,11 +503,14 @@ static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector,
        }
 
        sprintf(result[j++], "%d", i);
-       sprintf(result[j++], "%s", dim_state_str[dim->state]);
+       sprintf(result[j++], "%s", dim->state < ARRAY_SIZE(dim_state_str) ?
+               dim_state_str[dim->state] : "unknown");
        sprintf(result[j++], "%u", dim->profile_ix);
-       sprintf(result[j++], "%s", dim_cqe_mode_str[dim->mode]);
+       sprintf(result[j++], "%s", dim->mode < ARRAY_SIZE(dim_cqe_mode_str) ?
+               dim_cqe_mode_str[dim->mode] : "unknown");
        sprintf(result[j++], "%s",
-               dim_tune_stat_str[dim->tune_state]);
+               dim->tune_state < ARRAY_SIZE(dim_tune_stat_str) ?
+               dim_tune_stat_str[dim->tune_state] : "unknown");
        sprintf(result[j++], "%u", dim->steps_left);
        sprintf(result[j++], "%u", dim->steps_right);
        sprintf(result[j++], "%u", dim->tired);
index 0611750..b618797 100644
@@ -5139,7 +5139,7 @@ static int hns3_init_mac_addr(struct net_device *netdev)
        struct hns3_nic_priv *priv = netdev_priv(netdev);
        char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN];
        struct hnae3_handle *h = priv->ae_handle;
-       u8 mac_addr_temp[ETH_ALEN];
+       u8 mac_addr_temp[ETH_ALEN] = {0};
        int ret = 0;
 
        if (h->ae_algo->ops->get_mac_addr)
index 66e5807..5ea9e59 100644
@@ -61,6 +61,7 @@ static void hclge_sync_fd_table(struct hclge_dev *hdev);
 static void hclge_update_fec_stats(struct hclge_dev *hdev);
 static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret,
                                      int wait_cnt);
+static int hclge_update_port_info(struct hclge_dev *hdev);
 
 static struct hnae3_ae_algo ae_algo;
 
@@ -3041,6 +3042,9 @@ static void hclge_update_link_status(struct hclge_dev *hdev)
 
        if (state != hdev->hw.mac.link) {
                hdev->hw.mac.link = state;
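+               /* Refresh port info when the link comes up so e.g. speed and
+                * duplex are reported correctly.
+                */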
+               if (state == HCLGE_LINK_STATUS_UP)
+                       hclge_update_port_info(hdev);
+
                client->ops->link_status_change(handle, state);
                hclge_config_mac_tnl_int(hdev, state);
                if (rclient && rclient->ops->link_status_change)
@@ -10025,8 +10029,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id,
        struct hclge_vport_vlan_cfg *vlan, *tmp;
        struct hclge_dev *hdev = vport->back;
 
-       mutex_lock(&hdev->vport_lock);
-
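+       /* Caller must hold vport_lock. */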
        list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
                if (vlan->vlan_id == vlan_id) {
                        if (is_write_tbl && vlan->hd_tbl_status)
@@ -10041,8 +10043,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id,
                        break;
                }
        }
-
-       mutex_unlock(&hdev->vport_lock);
 }
 
 void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list)
@@ -10451,11 +10451,16 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto,
         * handle mailbox. Just record the vlan id, and remove it after
         * reset finished.
         */
+       mutex_lock(&hdev->vport_lock);
        if ((test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
             test_bit(HCLGE_STATE_RST_FAIL, &hdev->state)) && is_kill) {
                set_bit(vlan_id, vport->vlan_del_fail_bmap);
+               mutex_unlock(&hdev->vport_lock);
                return -EBUSY;
+       } else if (!is_kill && test_bit(vlan_id, vport->vlan_del_fail_bmap)) {
+               clear_bit(vlan_id, vport->vlan_del_fail_bmap);
        }
+       mutex_unlock(&hdev->vport_lock);
 
        /* when port base vlan enabled, we use port base vlan as the vlan
         * filter entry. In this case, we don't update vlan filter table
@@ -10470,17 +10475,22 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto,
        }
 
        if (!ret) {
-               if (!is_kill)
+               if (!is_kill) {
                        hclge_add_vport_vlan_table(vport, vlan_id,
                                                   writen_to_tbl);
-               else if (is_kill && vlan_id != 0)
+               } else if (is_kill && vlan_id != 0) {
+                       mutex_lock(&hdev->vport_lock);
                        hclge_rm_vport_vlan_table(vport, vlan_id, false);
+                       mutex_unlock(&hdev->vport_lock);
+               }
        } else if (is_kill) {
                /* When removing the hw vlan filter fails, record the vlan id
                 * and try to remove it from hw later, to stay consistent
                 * with the stack.
                 */
+               mutex_lock(&hdev->vport_lock);
                set_bit(vlan_id, vport->vlan_del_fail_bmap);
+               mutex_unlock(&hdev->vport_lock);
        }
 
        hclge_set_vport_vlan_fltr_change(vport);
@@ -10520,6 +10530,7 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev)
        int i, ret, sync_cnt = 0;
        u16 vlan_id;
 
+       mutex_lock(&hdev->vport_lock);
        /* start from vport 1 for PF is always alive */
        for (i = 0; i < hdev->num_alloc_vport; i++) {
                struct hclge_vport *vport = &hdev->vport[i];
@@ -10530,21 +10541,26 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev)
                        ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q),
                                                       vport->vport_id, vlan_id,
                                                       true);
-                       if (ret && ret != -EINVAL)
+                       if (ret && ret != -EINVAL) {
+                               mutex_unlock(&hdev->vport_lock);
                                return;
+                       }
 
                        clear_bit(vlan_id, vport->vlan_del_fail_bmap);
                        hclge_rm_vport_vlan_table(vport, vlan_id, false);
                        hclge_set_vport_vlan_fltr_change(vport);
 
                        sync_cnt++;
-                       if (sync_cnt >= HCLGE_MAX_SYNC_COUNT)
+                       if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) {
+                               mutex_unlock(&hdev->vport_lock);
                                return;
+                       }
 
                        vlan_id = find_first_bit(vport->vlan_del_fail_bmap,
                                                 VLAN_N_VID);
                }
        }
+       mutex_unlock(&hdev->vport_lock);
 
        hclge_sync_vlan_fltr_state(hdev);
 }
@@ -11651,6 +11667,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
                goto err_msi_irq_uninit;
 
        if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) {
+               clear_bit(HNAE3_DEV_SUPPORT_FEC_B, ae_dev->caps);
                if (hnae3_dev_phy_imp_supported(hdev))
                        ret = hclge_update_tp_port_info(hdev);
                else
index a4d68fb..0aa9bee 100644
@@ -1206,6 +1206,8 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle,
             test_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state)) && is_kill) {
                set_bit(vlan_id, hdev->vlan_del_fail_bmap);
                return -EBUSY;
+       } else if (!is_kill && test_bit(vlan_id, hdev->vlan_del_fail_bmap)) {
+               clear_bit(vlan_id, hdev->vlan_del_fail_bmap);
        }
 
        hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_VLAN,
@@ -1233,20 +1235,25 @@ static void hclgevf_sync_vlan_filter(struct hclgevf_dev *hdev)
        int ret, sync_cnt = 0;
        u16 vlan_id;
 
+       if (bitmap_empty(hdev->vlan_del_fail_bmap, VLAN_N_VID))
+               return;
+
+       rtnl_lock();
        vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID);
        while (vlan_id != VLAN_N_VID) {
                ret = hclgevf_set_vlan_filter(handle, htons(ETH_P_8021Q),
                                              vlan_id, true);
                if (ret)
-                       return;
+                       break;
 
                clear_bit(vlan_id, hdev->vlan_del_fail_bmap);
                sync_cnt++;
                if (sync_cnt >= HCLGEVF_MAX_SYNC_COUNT)
-                       return;
+                       break;
 
                vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID);
        }
+       rtnl_unlock();
 }
 
 static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
@@ -1974,8 +1981,18 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev,
        return HCLGEVF_VECTOR0_EVENT_OTHER;
 }
 
+static void hclgevf_reset_timer(struct timer_list *t)
+{
+       struct hclgevf_dev *hdev = from_timer(hdev, t, reset_timer);
+
+       hclgevf_clear_event_cause(hdev, HCLGEVF_VECTOR0_EVENT_RST);
+       hclgevf_reset_task_schedule(hdev);
+}
+
 static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data)
 {
+#define HCLGEVF_RESET_DELAY    5       /* ms */
+
        enum hclgevf_evt_cause event_cause;
        struct hclgevf_dev *hdev = data;
        u32 clearval;
@@ -1987,7 +2004,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data)
 
        switch (event_cause) {
        case HCLGEVF_VECTOR0_EVENT_RST:
-               hclgevf_reset_task_schedule(hdev);
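+               /* Defer reset handling briefly via a timer instead of
+                * scheduling the reset task straight from the ISR.
+                */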
+               mod_timer(&hdev->reset_timer,
+                         jiffies + msecs_to_jiffies(HCLGEVF_RESET_DELAY));
                break;
        case HCLGEVF_VECTOR0_EVENT_MBX:
                hclgevf_mbx_handler(hdev);
@@ -2930,6 +2948,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
                 HCLGEVF_DRIVER_NAME);
 
        hclgevf_task_schedule(hdev, round_jiffies_relative(HZ));
+       timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0);
 
        return 0;
 
index 81c16b8..a73f2bf 100644
@@ -219,6 +219,7 @@ struct hclgevf_dev {
        enum hnae3_reset_type reset_level;
        unsigned long reset_pending;
        enum hnae3_reset_type reset_type;
+       struct timer_list reset_timer;
 
 #define HCLGEVF_RESET_REQUESTED                0
 #define HCLGEVF_RESET_PENDING          1
index bbf7b14..85c2a63 100644
@@ -63,6 +63,9 @@ static int hclgevf_get_mbx_resp(struct hclgevf_dev *hdev, u16 code0, u16 code1,
                i++;
        }
 
+       /* ensure additional_info will be seen after received_resp */
+       smp_rmb();
+
        if (i >= HCLGEVF_MAX_TRY_TIMES) {
                dev_err(&hdev->pdev->dev,
                        "VF could not get mbx(%u,%u) resp(=%d) from PF in %d tries\n",
@@ -178,6 +181,10 @@ static void hclgevf_handle_mbx_response(struct hclgevf_dev *hdev,
        resp->resp_status = hclgevf_resp_to_errno(resp_status);
        memcpy(resp->additional_info, req->msg.resp_data,
               HCLGE_MBX_MAX_RESP_DATA_SIZE * sizeof(u8));
+
+       /* ensure additional_info will be seen before setting received_resp */
+       smp_wmb();
+
        if (match_id) {
                /* If match_id is not zero, it means PF support match_id.
                 * if the match_id is right, VF get the right response, or
index cfb1580..8b7504a 100644
@@ -1479,14 +1479,14 @@ ice_post_dwnld_pkg_actions(struct ice_hw *hw)
 }
 
 /**
- * ice_download_pkg
+ * ice_download_pkg_with_sig_seg
  * @hw: pointer to the hardware structure
  * @pkg_hdr: pointer to package header
  *
  * Handles the download of a complete package.
  */
 static enum ice_ddp_state
-ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr)
+ice_download_pkg_with_sig_seg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr)
 {
        enum ice_aq_err aq_err = hw->adminq.sq_last_status;
        enum ice_ddp_state state = ICE_DDP_PKG_ERR;
@@ -1519,6 +1519,103 @@ ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr)
                state = ice_post_dwnld_pkg_actions(hw);
 
        ice_release_global_cfg_lock(hw);
+
+       return state;
+}
+
+/**
+ * ice_dwnld_cfg_bufs
+ * @hw: pointer to the hardware structure
+ * @bufs: pointer to an array of buffers
+ * @count: the number of buffers in the array
+ *
+ * Obtains global config lock and downloads the package configuration buffers
+ * to the firmware.
+ */
+static enum ice_ddp_state
+ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count)
+{
+       enum ice_ddp_state state;
+       struct ice_buf_hdr *bh;
+       int status;
+
+       if (!bufs || !count)
+               return ICE_DDP_PKG_ERR;
+
+       /* If the first buffer's first section has its metadata bit set
+        * then there are no buffers to be downloaded, and the operation is
+        * considered a success.
+        */
+       bh = (struct ice_buf_hdr *)bufs;
+       if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF)
+               return ICE_DDP_PKG_SUCCESS;
+
+       status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE);
+       if (status) {
+               if (status == -EALREADY)
+                       return ICE_DDP_PKG_ALREADY_LOADED;
+               return ice_map_aq_err_to_ddp_state(hw->adminq.sq_last_status);
+       }
+
+       state = ice_dwnld_cfg_bufs_no_lock(hw, bufs, 0, count, true);
+       if (!state)
+               state = ice_post_dwnld_pkg_actions(hw);
+
+       ice_release_global_cfg_lock(hw);
+
+       return state;
+}
+
+/**
+ * ice_download_pkg_without_sig_seg
+ * @hw: pointer to the hardware structure
+ * @ice_seg: pointer to the segment of the package to be downloaded
+ *
+ * Handles the download of a complete package without a signature segment.
+ */
+static enum ice_ddp_state
+ice_download_pkg_without_sig_seg(struct ice_hw *hw, struct ice_seg *ice_seg)
+{
+       struct ice_buf_table *ice_buf_tbl;
+
+       ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n",
+                 ice_seg->hdr.seg_format_ver.major,
+                 ice_seg->hdr.seg_format_ver.minor,
+                 ice_seg->hdr.seg_format_ver.update,
+                 ice_seg->hdr.seg_format_ver.draft);
+
+       ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n",
+                 le32_to_cpu(ice_seg->hdr.seg_type),
+                 le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id);
+
+       ice_buf_tbl = ice_find_buf_table(ice_seg);
+
+       ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n",
+                 le32_to_cpu(ice_buf_tbl->buf_count));
+
+       return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array,
+                                 le32_to_cpu(ice_buf_tbl->buf_count));
+}
+
+/**
+ * ice_download_pkg
+ * @hw: pointer to the hardware structure
+ * @pkg_hdr: pointer to package header
+ * @ice_seg: pointer to the segment of the package to be downloaded
+ *
+ * Handles the download of a complete package.
+ */
+static enum ice_ddp_state
+ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr,
+                struct ice_seg *ice_seg)
+{
+       enum ice_ddp_state state;
+
+       if (hw->pkg_has_signing_seg)
+               state = ice_download_pkg_with_sig_seg(hw, pkg_hdr);
+       else
+               state = ice_download_pkg_without_sig_seg(hw, ice_seg);
+
        ice_post_pkg_dwnld_vlan_mode_cfg(hw);
 
        return state;
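
The newly factored ice_dwnld_cfg_bufs() wraps the firmware download in the global config lock and maps the lock-already-held case onto ICE_DDP_PKG_ALREADY_LOADED. A condensed sketch of that acquire/download/release flow, with hypothetical helpers and a stand-in state constant:

        #include <errno.h>

        struct hw;
        struct buf;

        int acquire_global_cfg_lock(struct hw *hw);
        void release_global_cfg_lock(struct hw *hw);
        int download_bufs(struct hw *hw, struct buf *bufs, unsigned int count);
        int post_download_actions(struct hw *hw);

        #define PKG_ALREADY_LOADED 1    /* stand-in for ICE_DDP_PKG_ALREADY_LOADED */

        int download_with_lock(struct hw *hw, struct buf *bufs, unsigned int count)
        {
                int state;

                if (!bufs || !count)
                        return -EINVAL;

                state = acquire_global_cfg_lock(hw);
                if (state == -EALREADY)         /* someone already loaded the pkg */
                        return PKG_ALREADY_LOADED;
                if (state)
                        return state;

                state = download_bufs(hw, bufs, count);
                if (!state)
                        state = post_download_actions(hw);

                release_global_cfg_lock(hw);    /* dropped on every path past acquire */
                return state;
        }
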
@@ -2083,7 +2180,7 @@ enum ice_ddp_state ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len)
 
        /* initialize package hints and then download package */
        ice_init_pkg_hints(hw, seg);
-       state = ice_download_pkg(hw, pkg);
+       state = ice_download_pkg(hw, pkg, seg);
        if (state == ICE_DDP_PKG_ALREADY_LOADED) {
                ice_debug(hw, ICE_DBG_INIT,
                          "package previously loaded - no work.\n");
index 835c419..86b180c 100644 (file)
@@ -815,12 +815,6 @@ ice_dpll_input_prio_set(const struct dpll_pin *pin, void *pin_priv,
        struct ice_pf *pf = d->pf;
        int ret;
 
-       if (prio > ICE_DPLL_PRIO_MAX) {
-               NL_SET_ERR_MSG_FMT(extack, "prio out of supported range 0-%d",
-                                  ICE_DPLL_PRIO_MAX);
-               return -EINVAL;
-       }
-
        mutex_lock(&pf->dplls.lock);
        ret = ice_dpll_hw_input_prio_set(pf, d, p, prio, extack);
        mutex_unlock(&pf->dplls.lock);
@@ -1756,6 +1750,7 @@ ice_dpll_init_dpll(struct ice_pf *pf, struct ice_dpll *d, bool cgu,
        }
        d->pf = pf;
        if (cgu) {
+               ice_dpll_update_state(pf, d, true);
                ret = dpll_device_register(d->dpll, type, &ice_dpll_ops, d);
                if (ret) {
                        dpll_device_put(d->dpll);
@@ -1796,8 +1791,6 @@ static int ice_dpll_init_worker(struct ice_pf *pf)
        struct ice_dplls *d = &pf->dplls;
        struct kthread_worker *kworker;
 
-       ice_dpll_update_state(pf, &d->eec, true);
-       ice_dpll_update_state(pf, &d->pps, true);
        kthread_init_delayed_work(&d->work, ice_dpll_periodic_work);
        kworker = kthread_create_worker(0, "ice-dplls-%s",
                                        dev_name(ice_pf_to_dev(pf)));
@@ -1830,6 +1823,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
        int num_pins, i, ret = -EINVAL;
        struct ice_hw *hw = &pf->hw;
        struct ice_dpll_pin *pins;
+       unsigned long caps;
        u8 freq_supp_num;
        bool input;
 
@@ -1849,6 +1843,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
        }
 
        for (i = 0; i < num_pins; i++) {
+               caps = 0;
                pins[i].idx = i;
                pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input);
                pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input);
@@ -1861,8 +1856,8 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
                                                      &dp->input_prio[i]);
                        if (ret)
                                return ret;
-                       pins[i].prop.capabilities |=
-                               DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE;
+                       caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE |
+                                DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE);
                        pins[i].prop.phase_range.min =
                                pf->dplls.input_phase_adj_max;
                        pins[i].prop.phase_range.max =
@@ -1872,9 +1867,11 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
                                pf->dplls.output_phase_adj_max;
                        pins[i].prop.phase_range.max =
                                -pf->dplls.output_phase_adj_max;
+                       ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps);
+                       if (ret)
+                               return ret;
                }
-               pins[i].prop.capabilities |=
-                       DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE;
+               pins[i].prop.capabilities = caps;
                ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL);
                if (ret)
                        return ret;
index bb32b6d..93172e9 100644 (file)
@@ -6,7 +6,6 @@
 
 #include "ice.h"
 
-#define ICE_DPLL_PRIO_MAX      0xF
 #define ICE_DPLL_RCLK_NUM_MAX  4
 
 /** ice_dpll_pin - store info about pins
index 6d57390..a00b55e 100644 (file)
@@ -3961,3 +3961,57 @@ int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num)
 
        return ret;
 }
+
+/**
+ * ice_cgu_get_output_pin_state_caps - get output pin state capabilities
+ * @hw: pointer to the hw struct
+ * @pin_id: id of a pin
+ * @caps: capabilities to modify
+ *
+ * Return:
+ * * 0 - success, state capabilities were modified
+ * * negative - failure, capabilities were not modified
+ */
+int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id,
+                                     unsigned long *caps)
+{
+       bool can_change = true;
+
+       switch (hw->device_id) {
+       case ICE_DEV_ID_E810C_SFP:
+               if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3)
+                       can_change = false;
+               break;
+       case ICE_DEV_ID_E810C_QSFP:
+               if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3 || pin_id == ZL_OUT4)
+                       can_change = false;
+               break;
+       case ICE_DEV_ID_E823L_10G_BASE_T:
+       case ICE_DEV_ID_E823L_1GBE:
+       case ICE_DEV_ID_E823L_BACKPLANE:
+       case ICE_DEV_ID_E823L_QSFP:
+       case ICE_DEV_ID_E823L_SFP:
+       case ICE_DEV_ID_E823C_10G_BASE_T:
+       case ICE_DEV_ID_E823C_BACKPLANE:
+       case ICE_DEV_ID_E823C_QSFP:
+       case ICE_DEV_ID_E823C_SFP:
+       case ICE_DEV_ID_E823C_SGMII:
+               if (hw->cgu_part_number ==
+                   ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL30632_80032 &&
+                   pin_id == ZL_OUT2)
+                       can_change = false;
+               else if (hw->cgu_part_number ==
+                        ICE_AQC_GET_LINK_TOPO_NODE_NR_SI5383_5384 &&
+                        pin_id == SI_OUT1)
+                       can_change = false;
+               break;
+       default:
+               return -EINVAL;
+       }
+       if (can_change)
+               *caps |= DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE;
+       else
+               *caps &= ~DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE;
+
+       return 0;
+}
index 36aeeef..cf76701 100644 (file)
@@ -282,6 +282,8 @@ int ice_get_cgu_state(struct ice_hw *hw, u8 dpll_idx,
 int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num);
 
 void ice_ptp_init_phy_model(struct ice_hw *hw);
+int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id,
+                                     unsigned long *caps);
 
 #define PFTSYN_SEM_BYTES       4
 
index 9081713..29aac32 100644 (file)
@@ -4790,14 +4790,17 @@ static void mvneta_ethtool_get_strings(struct net_device *netdev, u32 sset,
                                       u8 *data)
 {
        if (sset == ETH_SS_STATS) {
+               struct mvneta_port *pp = netdev_priv(netdev);
                int i;
 
                for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++)
                        memcpy(data + i * ETH_GSTRING_LEN,
                               mvneta_statistics[i].name, ETH_GSTRING_LEN);
 
-               data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics);
-               page_pool_ethtool_stats_get_strings(data);
+               if (!pp->bm_priv) {
+                       data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics);
+                       page_pool_ethtool_stats_get_strings(data);
+               }
        }
 }
 
@@ -4915,8 +4918,10 @@ static void mvneta_ethtool_pp_stats(struct mvneta_port *pp, u64 *data)
        struct page_pool_stats stats = {};
        int i;
 
-       for (i = 0; i < rxq_number; i++)
-               page_pool_get_stats(pp->rxqs[i].page_pool, &stats);
+       for (i = 0; i < rxq_number; i++) {
+               if (pp->rxqs[i].page_pool)
+                       page_pool_get_stats(pp->rxqs[i].page_pool, &stats);
+       }
 
        page_pool_ethtool_stats_get(data, &stats);
 }
@@ -4932,14 +4937,21 @@ static void mvneta_ethtool_get_stats(struct net_device *dev,
        for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++)
                *data++ = pp->ethtool_stats[i];
 
-       mvneta_ethtool_pp_stats(pp, data);
+       if (!pp->bm_priv)
+               mvneta_ethtool_pp_stats(pp, data);
 }
 
 static int mvneta_ethtool_get_sset_count(struct net_device *dev, int sset)
 {
-       if (sset == ETH_SS_STATS)
-               return ARRAY_SIZE(mvneta_statistics) +
-                      page_pool_ethtool_stats_get_count();
+       if (sset == ETH_SS_STATS) {
+               int count = ARRAY_SIZE(mvneta_statistics);
+               struct mvneta_port *pp = netdev_priv(dev);
+
+               if (!pp->bm_priv)
+                       count += page_pool_ethtool_stats_get_count();
+
+               return count;
+       }
 
        return -EOPNOTSUPP;
 }
index bb11e64..af3928e 100644 (file)
@@ -177,6 +177,8 @@ static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq,
 
 static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
                                    struct mlx5_cqe64 *cqe,
+                                   u8 *md_buff,
+                                   u8 *md_buff_sz,
                                    int budget)
 {
        struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list;
@@ -211,19 +213,24 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
        mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
 out:
        napi_consume_skb(skb, budget);
-       mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id);
+       md_buff[(*md_buff_sz)++] = metadata_id;
        if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) &&
            !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
                queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work);
 }
 
-static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
+static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int napi_budget)
 {
        struct mlx5e_ptpsq *ptpsq = container_of(cq, struct mlx5e_ptpsq, ts_cq);
-       struct mlx5_cqwq *cqwq = &cq->wq;
+       int budget = min(napi_budget, MLX5E_TX_CQ_POLL_BUDGET);
+       u8 metadata_buff[MLX5E_TX_CQ_POLL_BUDGET];
+       u8 metadata_buff_sz = 0;
+       struct mlx5_cqwq *cqwq;
        struct mlx5_cqe64 *cqe;
        int work_done = 0;
 
+       cqwq = &cq->wq;
+
        if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state)))
                return false;
 
@@ -234,7 +241,8 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
        do {
                mlx5_cqwq_pop(cqwq);
 
-               mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, budget);
+               mlx5e_ptp_handle_ts_cqe(ptpsq, cqe,
+                                       metadata_buff, &metadata_buff_sz, napi_budget);
        } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq)));
 
        mlx5_cqwq_update_db_record(cqwq);
@@ -242,6 +250,10 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
        /* ensure cq space is freed before enabling more cqes */
        wmb();
 
+       while (metadata_buff_sz > 0)
+               mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist,
+                                            metadata_buff[--metadata_buff_sz]);
+
        mlx5e_txqsq_wake(&ptpsq->txqsq);
 
        return work_done == budget;
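
These hunks stop recycling PTP metadata ids one at a time from inside the completion handler: ids are staged in an on-stack array while the CQ is drained, and only after the doorbell record update and its wmb() are they pushed back to the freelist, so the hardware sees the freed CQ space before any id can be reused. A condensed sketch of the ordering with hypothetical helpers (not the mlx5 API):

        #include <stdint.h>

        #define BUDGET 64

        struct cq;
        struct fifo;

        int poll_one_cqe(struct cq *cq, uint8_t *id);   /* 1 if a CQE was consumed */
        void update_doorbell_record(struct cq *cq);
        void barrier_wmb(void);
        void fifo_push(struct fifo *freelist, uint8_t id);

        void drain_ts_cq(struct cq *cq, struct fifo *freelist)
        {
                uint8_t staged[BUDGET];
                uint8_t n = 0;
                uint8_t id;

                while (n < BUDGET && poll_one_cqe(cq, &id))
                        staged[n++] = id;       /* no freelist push yet */

                update_doorbell_record(cq);
                barrier_wmb();                  /* CQ space visible first */

                while (n > 0)
                        fifo_push(freelist, staged[--n]);       /* safe to reuse */
        }
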
index fea8c0a..4358798 100644 (file)
@@ -492,11 +492,11 @@ static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter,
 
 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq)
 {
-       char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {};
        char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
        struct mlx5e_icosq *icosq = rq->icosq;
        struct mlx5e_priv *priv = rq->priv;
        struct mlx5e_err_ctx err_ctx = {};
+       char icosq_str[32] = {};
 
        err_ctx.ctx = rq;
        err_ctx.recover = mlx5e_rx_reporter_timeout_recover;
@@ -505,7 +505,7 @@ void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq)
        if (icosq)
                snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn);
        snprintf(err_str, sizeof(err_str),
-                "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x",
+                "RX timeout on channel: %d, %s RQ: 0x%x, CQ: 0x%x",
                 rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn);
 
        mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
index 00a04fd..668da5c 100644 (file)
@@ -300,9 +300,6 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
        if (err)
                goto destroy_neigh_entry;
 
-       e->encap_size = ipv4_encap_size;
-       e->encap_header = encap_header;
-
        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(attr.n, NULL);
                /* the encap entry will be made valid on neigh update event
@@ -322,6 +319,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
                goto destroy_neigh_entry;
        }
 
+       e->encap_size = ipv4_encap_size;
+       e->encap_header = encap_header;
        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev));
        mlx5e_route_lookup_ipv4_put(&attr);
@@ -404,16 +403,12 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv,
        if (err)
                goto free_encap;
 
-       e->encap_size = ipv4_encap_size;
-       kfree(e->encap_header);
-       e->encap_header = encap_header;
-
        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(attr.n, NULL);
                /* the encap entry will be made valid on neigh update event
                 * and not used before that.
                 */
-               goto release_neigh;
+               goto free_encap;
        }
 
        memset(&reformat_params, 0, sizeof(reformat_params));
@@ -427,6 +422,10 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv,
                goto free_encap;
        }
 
+       e->encap_size = ipv4_encap_size;
+       kfree(e->encap_header);
+       e->encap_header = encap_header;
+
        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev));
        mlx5e_route_lookup_ipv4_put(&attr);
@@ -568,9 +567,6 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
        if (err)
                goto destroy_neigh_entry;
 
-       e->encap_size = ipv6_encap_size;
-       e->encap_header = encap_header;
-
        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(attr.n, NULL);
                /* the encap entry will be made valid on neigh update event
@@ -590,6 +586,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
                goto destroy_neigh_entry;
        }
 
+       e->encap_size = ipv6_encap_size;
+       e->encap_header = encap_header;
        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev));
        mlx5e_route_lookup_ipv6_put(&attr);
@@ -671,16 +669,12 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv,
        if (err)
                goto free_encap;
 
-       e->encap_size = ipv6_encap_size;
-       kfree(e->encap_header);
-       e->encap_header = encap_header;
-
        if (!(nud_state & NUD_VALID)) {
                neigh_event_send(attr.n, NULL);
                /* the encap entry will be made valid on neigh update event
                 * and not used before that.
                 */
-               goto release_neigh;
+               goto free_encap;
        }
 
        memset(&reformat_params, 0, sizeof(reformat_params));
@@ -694,6 +688,10 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv,
                goto free_encap;
        }
 
+       e->encap_size = ipv6_encap_size;
+       kfree(e->encap_header);
+       e->encap_header = encap_header;
+
        e->flags |= MLX5_ENCAP_ENTRY_VALID;
        mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev));
        mlx5e_route_lookup_ipv6_put(&attr);
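
All four tunnel hunks apply the same commit-state-last rule: e->encap_size and e->encap_header are assigned (and the old header freed) only after the reformat allocation succeeds, and the update-path failure label changes from release_neigh to free_encap so the staging buffer has exactly one owner. A runnable sketch of the pattern (generic names, not the mlx5 structures):

        #include <stdlib.h>
        #include <string.h>

        struct encap_entry {
                void *header;
                size_t size;
        };

        static int encap_update(struct encap_entry *e, const void *hdr, size_t sz,
                                int (*offload)(const void *, size_t))
        {
                void *staging = malloc(sz);
                int err;

                if (!staging)
                        return -1;
                memcpy(staging, hdr, sz);

                err = offload(staging, sz);     /* validate before publishing */
                if (err) {
                        free(staging);          /* single owner: no double free */
                        return err;
                }

                free(e->header);                /* commit only after success */
                e->header = staging;
                e->size = sz;
                return 0;
        }
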
index 215261a..792a0ea 100644 (file)
@@ -43,12 +43,17 @@ void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
                               struct ethtool_drvinfo *drvinfo)
 {
        struct mlx5_core_dev *mdev = priv->mdev;
+       int count;
 
        strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver));
-       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                "%d.%d.%04d (%.16s)",
-                fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev),
-                mdev->board_id);
+       count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+                        "%d.%d.%04d (%.16s)", fw_rev_maj(mdev),
+                        fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id);
+       if (count == sizeof(drvinfo->fw_version))
+               snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+                        "%d.%d.%04d", fw_rev_maj(mdev),
+                        fw_rev_min(mdev), fw_rev_sub(mdev));
+
        strscpy(drvinfo->bus_info, dev_name(mdev->device),
                sizeof(drvinfo->bus_info));
 }
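
Both drvinfo hunks lean on snprintf() returning the length it would have written: when the board id no longer fits in fw_version, the string is rebuilt without it instead of being shipped truncated. A runnable userspace demo of the fallback (made-up version numbers; the demo uses >=, the general form of the truncation test):

        #include <stdio.h>

        static void fw_version_string(char *dst, size_t sz)
        {
                int count = snprintf(dst, sz, "%d.%d.%04d (%.16s)",
                                     22, 39, 1002, "MT_0000000001234");

                if (count >= (int)sz)   /* would not fit: drop the board id */
                        snprintf(dst, sz, "%d.%d.%04d", 22, 39, 1002);
        }
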
index 693e55b..3ab682b 100644 (file)
@@ -71,13 +71,17 @@ static void mlx5e_rep_get_drvinfo(struct net_device *dev,
 {
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5_core_dev *mdev = priv->mdev;
+       int count;
 
        strscpy(drvinfo->driver, mlx5e_rep_driver_name,
                sizeof(drvinfo->driver));
-       snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
-                "%d.%d.%04d (%.16s)",
-                fw_rev_maj(mdev), fw_rev_min(mdev),
-                fw_rev_sub(mdev), mdev->board_id);
+       count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+                        "%d.%d.%04d (%.16s)", fw_rev_maj(mdev),
+                        fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id);
+       if (count == sizeof(drvinfo->fw_version))
+               snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+                        "%d.%d.%04d", fw_rev_maj(mdev),
+                        fw_rev_min(mdev), fw_rev_sub(mdev));
 }
 
 static const struct counter_desc sw_rep_stats_desc[] = {
index 9a5a5c2..7ca9e5b 100644 (file)
@@ -3147,7 +3147,7 @@ static struct mlx5_fields fields[] = {
        OFFLOAD(DIPV6_31_0,   32, U32_MAX, ip6.daddr.s6_addr32[3], 0,
                dst_ipv4_dst_ipv6.ipv6_layout.ipv6[12]),
        OFFLOAD(IPV6_HOPLIMIT, 8,  U8_MAX, ip6.hop_limit, 0, ttl_hoplimit),
-       OFFLOAD(IP_DSCP, 16,  0xc00f, ip6, 0, ip_dscp),
+       OFFLOAD(IP_DSCP, 16,  0x0fc0, ip6, 0, ip_dscp),
 
        OFFLOAD(TCP_SPORT, 16, U16_MAX, tcp.source,  0, tcp_sport),
        OFFLOAD(TCP_DPORT, 16, U16_MAX, tcp.dest,    0, tcp_dport),
@@ -3158,21 +3158,31 @@ static struct mlx5_fields fields[] = {
        OFFLOAD(UDP_DPORT, 16, U16_MAX, udp.dest,   0, udp_dport),
 };
 
-static unsigned long mask_to_le(unsigned long mask, int size)
+static u32 mask_field_get(void *mask, struct mlx5_fields *f)
 {
-       __be32 mask_be32;
-       __be16 mask_be16;
-
-       if (size == 32) {
-               mask_be32 = (__force __be32)(mask);
-               mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32));
-       } else if (size == 16) {
-               mask_be32 = (__force __be32)(mask);
-               mask_be16 = *(__be16 *)&mask_be32;
-               mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16));
+       switch (f->field_bsize) {
+       case 32:
+               return be32_to_cpu(*(__be32 *)mask) & f->field_mask;
+       case 16:
+               return be16_to_cpu(*(__be16 *)mask) & (u16)f->field_mask;
+       default:
+               return *(u8 *)mask & (u8)f->field_mask;
        }
+}
 
-       return mask;
+static void mask_field_clear(void *mask, struct mlx5_fields *f)
+{
+       switch (f->field_bsize) {
+       case 32:
+               *(__be32 *)mask &= ~cpu_to_be32(f->field_mask);
+               break;
+       case 16:
+               *(__be16 *)mask &= ~cpu_to_be16((u16)f->field_mask);
+               break;
+       default:
+               *(u8 *)mask &= ~(u8)f->field_mask;
+               break;
+       }
 }
 
 static int offload_pedit_fields(struct mlx5e_priv *priv,
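
mask_field_get()/mask_field_clear() above retire the mask_to_le() juggling: pedit masks sit in the packet-header layout in big-endian, so each helper converts through the field's true width instead of reinterpreting an unsigned long. A standalone analogue of the read side, with ntohl()/ntohs() standing in for be32_to_cpu()/be16_to_cpu():

        #include <stdint.h>
        #include <arpa/inet.h>

        static uint32_t mask_field_get(const void *mask, int field_bsize,
                                       uint32_t field_mask)
        {
                switch (field_bsize) {
                case 32:
                        return ntohl(*(const uint32_t *)mask) & field_mask;
                case 16:
                        return ntohs(*(const uint16_t *)mask) & (uint16_t)field_mask;
                default:
                        return *(const uint8_t *)mask & (uint8_t)field_mask;
                }
        }
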
@@ -3184,11 +3194,12 @@ static int offload_pedit_fields(struct mlx5e_priv *priv,
        struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals;
        struct pedit_headers_action *hdrs = parse_attr->hdrs;
        void *headers_c, *headers_v, *action, *vals_p;
-       u32 *s_masks_p, *a_masks_p, s_mask, a_mask;
        struct mlx5e_tc_mod_hdr_acts *mod_acts;
-       unsigned long mask, field_mask;
+       void *s_masks_p, *a_masks_p;
        int i, first, last, next_z;
        struct mlx5_fields *f;
+       unsigned long mask;
+       u32 s_mask, a_mask;
        u8 cmd;
 
        mod_acts = &parse_attr->mod_hdr_acts;
@@ -3204,15 +3215,11 @@ static int offload_pedit_fields(struct mlx5e_priv *priv,
                bool skip;
 
                f = &fields[i];
-               /* avoid seeing bits set from previous iterations */
-               s_mask = 0;
-               a_mask = 0;
-
                s_masks_p = (void *)set_masks + f->offset;
                a_masks_p = (void *)add_masks + f->offset;
 
-               s_mask = *s_masks_p & f->field_mask;
-               a_mask = *a_masks_p & f->field_mask;
+               s_mask = mask_field_get(s_masks_p, f);
+               a_mask = mask_field_get(a_masks_p, f);
 
                if (!s_mask && !a_mask) /* nothing to offload here */
                        continue;
@@ -3239,22 +3246,20 @@ static int offload_pedit_fields(struct mlx5e_priv *priv,
                                         match_mask, f->field_bsize))
                                skip = true;
                        /* clear to denote we consumed this field */
-                       *s_masks_p &= ~f->field_mask;
+                       mask_field_clear(s_masks_p, f);
                } else {
                        cmd  = MLX5_ACTION_TYPE_ADD;
                        mask = a_mask;
                        vals_p = (void *)add_vals + f->offset;
                        /* add 0 is no change */
-                       if ((*(u32 *)vals_p & f->field_mask) == 0)
+                       if (!mask_field_get(vals_p, f))
                                skip = true;
                        /* clear to denote we consumed this field */
-                       *a_masks_p &= ~f->field_mask;
+                       mask_field_clear(a_masks_p, f);
                }
                if (skip)
                        continue;
 
-               mask = mask_to_le(mask, f->field_bsize);
-
                first = find_first_bit(&mask, f->field_bsize);
                next_z = find_next_zero_bit(&mask, f->field_bsize, first);
                last  = find_last_bit(&mask, f->field_bsize);
@@ -3281,10 +3286,9 @@ static int offload_pedit_fields(struct mlx5e_priv *priv,
                MLX5_SET(set_action_in, action, field, f->field);
 
                if (cmd == MLX5_ACTION_TYPE_SET) {
+                       unsigned long field_mask = f->field_mask;
                        int start;
 
-                       field_mask = mask_to_le(f->field_mask, f->field_bsize);
-
                        /* if field is bit sized it can start not from first bit */
                        start = find_first_bit(&field_mask, f->field_bsize);
 
index d41435c..f0b506e 100644 (file)
@@ -399,9 +399,9 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
                u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata);
 
                mlx5e_skb_cb_hwtstamp_init(skb);
-               mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index);
                mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb,
                                           metadata_index);
+               mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index);
                if (!netif_tx_queue_stopped(sq->txq) &&
                    mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) {
                        netif_tx_stop_queue(sq->txq);
@@ -494,10 +494,10 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
 err_drop:
        stats->dropped++;
-       dev_kfree_skb_any(skb);
        if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
                mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist,
                                             be32_to_cpu(eseg->flow_table_metadata));
+       dev_kfree_skb_any(skb);
        mlx5e_tx_flush(sq);
 }
 
index ea0405e..40a6cb0 100644 (file)
@@ -885,11 +885,14 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx)
 {
        struct mlx5_eq_table *table = dev->priv.eq_table;
        struct mlx5_irq *irq;
+       int cpu;
 
        irq = xa_load(&table->comp_irqs, vecidx);
        if (!irq)
                return;
 
+       cpu = cpumask_first(mlx5_irq_get_affinity_mask(irq));
+       cpumask_clear_cpu(cpu, &table->used_cpus);
        xa_erase(&table->comp_irqs, vecidx);
        mlx5_irq_affinity_irq_release(dev, irq);
 }
@@ -897,16 +900,26 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx)
 static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx)
 {
        struct mlx5_eq_table *table = dev->priv.eq_table;
+       struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+       struct irq_affinity_desc af_desc = {};
        struct mlx5_irq *irq;
 
-       irq = mlx5_irq_affinity_irq_request_auto(dev, &table->used_cpus, vecidx);
-       if (IS_ERR(irq)) {
-               /* In case SF irq pool does not exist, fallback to the PF irqs*/
-               if (PTR_ERR(irq) == -ENOENT)
-                       return comp_irq_request_pci(dev, vecidx);
+       /* In case SF irq pool does not exist, fall back to the PF irqs */
+       if (!mlx5_irq_pool_is_sf_pool(pool))
+               return comp_irq_request_pci(dev, vecidx);
 
+       af_desc.is_managed = 1;
+       cpumask_copy(&af_desc.mask, cpu_online_mask);
+       cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus);
+       irq = mlx5_irq_affinity_request(pool, &af_desc);
+       if (IS_ERR(irq))
                return PTR_ERR(irq);
-       }
+
+       cpumask_or(&table->used_cpus, &table->used_cpus, mlx5_irq_get_affinity_mask(irq));
+       mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
+                     pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
+                     cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
+                     mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
 
        return xa_err(xa_store(&table->comp_irqs, vecidx, irq, GFP_KERNEL));
 }
index b296ac5..88236e7 100644 (file)
@@ -984,7 +984,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw,
        dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID;
        flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
-       if (rep->vport == MLX5_VPORT_UPLINK && on_esw->offloads.ft_ipsec_tx_pol) {
+       if (rep->vport == MLX5_VPORT_UPLINK &&
+           on_esw == from_esw && on_esw->offloads.ft_ipsec_tx_pol) {
                dest.ft = on_esw->offloads.ft_ipsec_tx_pol;
                flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL;
                dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
index 047d5fe..612e666 100644 (file)
@@ -168,45 +168,3 @@ void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *i
                if (pool->irqs_per_cpu)
                        cpu_put(pool, cpu);
 }
-
-/**
- * mlx5_irq_affinity_irq_request_auto - request one IRQ for mlx5 device.
- * @dev: mlx5 device that is requesting the IRQ.
- * @used_cpus: cpumask of bounded cpus by the device
- * @vecidx: vector index to request an IRQ for.
- *
- * Each IRQ is bounded to at most 1 CPU.
- * This function is requesting an IRQ according to the default assignment.
- * The default assignment policy is:
- * - request the least loaded IRQ which is not bound to any
- *   CPU of the previous IRQs requested.
- *
- * On success, this function updates used_cpus mask and returns an irq pointer.
- * In case of an error, an appropriate error pointer is returned.
- */
-struct mlx5_irq *mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev,
-                                                   struct cpumask *used_cpus, u16 vecidx)
-{
-       struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
-       struct irq_affinity_desc af_desc = {};
-       struct mlx5_irq *irq;
-
-       if (!mlx5_irq_pool_is_sf_pool(pool))
-               return ERR_PTR(-ENOENT);
-
-       af_desc.is_managed = 1;
-       cpumask_copy(&af_desc.mask, cpu_online_mask);
-       cpumask_andnot(&af_desc.mask, &af_desc.mask, used_cpus);
-       irq = mlx5_irq_affinity_request(pool, &af_desc);
-
-       if (IS_ERR(irq))
-               return irq;
-
-       cpumask_or(used_cpus, used_cpus, mlx5_irq_get_affinity_mask(irq));
-       mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
-                     pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
-                     cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
-                     mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
-
-       return irq;
-}
index aa29f09..0c83ef1 100644 (file)
@@ -384,7 +384,12 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
 
 static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta)
 {
-       return mlx5_ptp_adjtime(ptp, delta);
+       struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info);
+       struct mlx5_core_dev *mdev;
+
+       mdev = container_of(clock, struct mlx5_core_dev, clock);
+
+       return mlx5_ptp_adjtime_real_time(mdev, delta);
 }
 
 static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm)
index 6536482..4dcf995 100644 (file)
@@ -28,7 +28,7 @@
 struct mlx5_irq {
        struct atomic_notifier_head nh;
        cpumask_var_t mask;
-       char name[MLX5_MAX_IRQ_NAME];
+       char name[MLX5_MAX_IRQ_FORMATTED_NAME];
        struct mlx5_irq_pool *pool;
        int refcount;
        struct msi_map map;
@@ -292,8 +292,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
        else
                irq_sf_set_name(pool, name, i);
        ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
-       snprintf(irq->name, MLX5_MAX_IRQ_NAME,
-                "%s@pci:%s", name, pci_name(dev->pdev));
+       snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME,
+                MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev));
        err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
                          &irq->nh);
        if (err) {
index d3a77a0..c4d377f 100644 (file)
@@ -7,6 +7,9 @@
 #include <linux/mlx5/driver.h>
 
 #define MLX5_MAX_IRQ_NAME (32)
+#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s")
+#define MLX5_MAX_IRQ_FORMATTED_NAME \
+       (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR))
 /* max irq_index is 2047, so four chars */
 #define MLX5_MAX_IRQ_IDX_CHARS (4)
 #define MLX5_EQ_REFS_PER_IRQ (2)
index 6ea88a5..e3ec559 100644 (file)
@@ -57,7 +57,8 @@ static const char *dr_action_id_to_str(enum mlx5dr_action_type action_id)
 
 static bool mlx5dr_action_supp_fwd_fdb_multi_ft(struct mlx5_core_dev *dev)
 {
-       return (MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) ||
+       return (MLX5_CAP_GEN(dev, steering_format_version) < MLX5_STEERING_FORMAT_CONNECTX_6DX ||
+               MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) ||
                MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table));
 }
 
index 4e8527a..6fa06ba 100644 (file)
@@ -52,7 +52,6 @@ struct dr_qp_init_attr {
        u32 cqn;
        u32 pdn;
        u32 max_send_wr;
-       u32 max_send_sge;
        struct mlx5_uars_page *uar;
        u8 isolate_vl_tc:1;
 };
@@ -247,37 +246,6 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
        return err == CQ_POLL_ERR ? err : npolled;
 }
 
-static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr)
-{
-       return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) +
-                                 sizeof(struct mlx5_wqe_flow_update_ctrl_seg) +
-                                 sizeof(struct mlx5_wqe_header_modify_argument_update_seg));
-}
-
-/* We calculate for specific RC QP with the required functionality */
-static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr)
-{
-       int update_arg_size;
-       int inl_size = 0;
-       int tot_size;
-       int size;
-
-       update_arg_size = dr_qp_get_args_update_send_wqe_size(attr);
-
-       size = sizeof(struct mlx5_wqe_ctrl_seg) +
-              sizeof(struct mlx5_wqe_raddr_seg);
-       inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) +
-                               DR_STE_SIZE, 16);
-
-       size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg);
-
-       size = max(size, update_arg_size);
-
-       tot_size = max(size, inl_size);
-
-       return ALIGN(tot_size, MLX5_SEND_WQE_BB);
-}
-
 static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
                                         struct dr_qp_init_attr *attr)
 {
@@ -285,7 +253,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
        u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
        struct mlx5_wq_param wqp;
        struct mlx5dr_qp *dr_qp;
-       int wqe_size;
        int inlen;
        void *qpc;
        void *in;
@@ -365,15 +332,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
        if (err)
                goto err_in;
        dr_qp->uar = attr->uar;
-       wqe_size = dr_qp_calc_rc_send_wqe(attr);
-       dr_qp->max_inline_data = min(wqe_size -
-                                    (sizeof(struct mlx5_wqe_ctrl_seg) +
-                                     sizeof(struct mlx5_wqe_raddr_seg) +
-                                     sizeof(struct mlx5_wqe_inline_seg)),
-                                    (2 * MLX5_SEND_WQE_BB -
-                                     (sizeof(struct mlx5_wqe_ctrl_seg) +
-                                      sizeof(struct mlx5_wqe_raddr_seg) +
-                                      sizeof(struct mlx5_wqe_inline_seg))));
 
        return dr_qp;
 
@@ -437,48 +395,8 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl,
                MLX5_SEND_WQE_DS;
 }
 
-static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp,
-                              struct dr_data_seg *data_seg, void *wqe)
-{
-       int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) +
-                               sizeof(struct mlx5_wqe_raddr_seg) +
-                               sizeof(struct mlx5_wqe_inline_seg);
-       struct mlx5_wqe_inline_seg *seg;
-       int left_space;
-       int inl = 0;
-       void *addr;
-       int len;
-       int idx;
-
-       seg = wqe;
-       wqe += sizeof(*seg);
-       addr = (void *)(unsigned long)(data_seg->addr);
-       len  = data_seg->length;
-       inl += len;
-       left_space = MLX5_SEND_WQE_BB - inline_header_size;
-
-       if (likely(len > left_space)) {
-               memcpy(wqe, addr, left_space);
-               len -= left_space;
-               addr += left_space;
-               idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1);
-               wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
-       }
-
-       memcpy(wqe, addr, len);
-
-       if (likely(inl)) {
-               seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
-               return DIV_ROUND_UP(inl + sizeof(seg->byte_count),
-                                   MLX5_SEND_WQE_DS);
-       } else {
-               return 0;
-       }
-}
-
 static void
-dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp,
-                                 struct mlx5_wqe_ctrl_seg *wq_ctrl,
+dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl,
                                  u64 remote_addr,
                                  u32 rkey,
                                  struct dr_data_seg *data_seg,
@@ -494,17 +412,15 @@ dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp,
        wq_raddr->reserved = 0;
 
        wq_dseg = (void *)(wq_raddr + 1);
-       /* WQE ctrl segment + WQE remote addr segment */
-       *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS;
 
-       if (data_seg->send_flags & IB_SEND_INLINE) {
-               *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg);
-       } else {
-               wq_dseg->byte_count = cpu_to_be32(data_seg->length);
-               wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
-               wq_dseg->addr = cpu_to_be64(data_seg->addr);
-               *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS;  /* WQE data segment */
-       }
+       wq_dseg->byte_count = cpu_to_be32(data_seg->length);
+       wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
+       wq_dseg->addr = cpu_to_be64(data_seg->addr);
+
+       *size = (sizeof(*wq_ctrl) +    /* WQE ctrl segment */
+                sizeof(*wq_dseg) +    /* WQE data segment */
+                sizeof(*wq_raddr)) /  /* WQE remote addr segment */
+               MLX5_SEND_WQE_DS;
 }
 
 static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl,
@@ -535,7 +451,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
        switch (opcode) {
        case MLX5_OPCODE_RDMA_READ:
        case MLX5_OPCODE_RDMA_WRITE:
-               dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr,
+               dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr,
                                                  rkey, data_seg, &size);
                break;
        case MLX5_OPCODE_FLOW_TBL_ACCESS:
@@ -656,7 +572,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring,
        if (send_ring->pending_wqe % send_ring->signal_th == 0)
                send_info->write.send_flags |= IB_SEND_SIGNALED;
        else
-               send_info->write.send_flags &= ~IB_SEND_SIGNALED;
+               send_info->write.send_flags = 0;
 }
 
 static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
@@ -680,13 +596,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
        }
 
        send_ring->pending_wqe++;
-       if (!send_info->write.lkey)
-               send_info->write.send_flags |= IB_SEND_INLINE;
 
        if (send_ring->pending_wqe % send_ring->signal_th == 0)
                send_info->write.send_flags |= IB_SEND_SIGNALED;
-       else
-               send_info->write.send_flags &= ~IB_SEND_SIGNALED;
 
        send_ring->pending_wqe++;
        send_info->read.length = send_info->write.length;
@@ -696,9 +608,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn,
        send_info->read.lkey = send_ring->sync_mr->mkey;
 
        if (send_ring->pending_wqe % send_ring->signal_th == 0)
-               send_info->read.send_flags |= IB_SEND_SIGNALED;
+               send_info->read.send_flags = IB_SEND_SIGNALED;
        else
-               send_info->read.send_flags &= ~IB_SEND_SIGNALED;
+               send_info->read.send_flags = 0;
 }
 
 static void dr_fill_data_segs(struct mlx5dr_domain *dmn,
@@ -1345,7 +1257,6 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
        dmn->send_ring->cq->qp = dmn->send_ring->qp;
 
        dmn->info.max_send_wr = QUEUE_SIZE;
-       init_attr.max_send_sge = 1;
        dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
                                        DR_STE_SIZE);
 
index 0c76c16..b9bb1d2 100644 (file)
@@ -624,6 +624,7 @@ struct rtl8169_private {
 
        unsigned supports_gmii:1;
        unsigned aspm_manageable:1;
+       unsigned dash_enabled:1;
        dma_addr_t counters_phys_addr;
        struct rtl8169_counters *counters;
        struct rtl8169_tc_offsets tc_offset;
@@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp)
        return r8168ep_ocp_read(tp, 0x128) & BIT(0);
 }
 
-static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp)
+static bool rtl_dash_is_enabled(struct rtl8169_private *tp)
+{
+       switch (tp->dash_type) {
+       case RTL_DASH_DP:
+               return r8168dp_check_dash(tp);
+       case RTL_DASH_EP:
+               return r8168ep_check_dash(tp);
+       default:
+               return false;
+       }
+}
+
+static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp)
 {
        switch (tp->mac_version) {
        case RTL_GIGA_MAC_VER_28:
        case RTL_GIGA_MAC_VER_31:
-               return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE;
+               return RTL_DASH_DP;
        case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
-               return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE;
+               return RTL_DASH_EP;
        default:
                return RTL_DASH_NONE;
        }
@@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
 
        device_set_wakeup_enable(tp_to_dev(tp), wolopts);
 
-       if (tp->dash_type == RTL_DASH_NONE) {
+       if (!tp->dash_enabled) {
                rtl_set_d3_pll_down(tp, !wolopts);
                tp->dev->wol_enabled = wolopts ? 1 : 0;
        }
@@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp)
 
 static void rtl_prepare_power_down(struct rtl8169_private *tp)
 {
-       if (tp->dash_type != RTL_DASH_NONE)
+       if (tp->dash_enabled)
                return;
 
        if (tp->mac_version == RTL_GIGA_MAC_VER_32 ||
@@ -4648,10 +4661,16 @@ static void rtl8169_down(struct rtl8169_private *tp)
        rtl8169_cleanup(tp);
        rtl_disable_exit_l1(tp);
        rtl_prepare_power_down(tp);
+
+       if (tp->dash_type != RTL_DASH_NONE)
+               rtl8168_driver_stop(tp);
 }
 
 static void rtl8169_up(struct rtl8169_private *tp)
 {
+       if (tp->dash_type != RTL_DASH_NONE)
+               rtl8168_driver_start(tp);
+
        pci_set_master(tp->pci_dev);
        phy_init_hw(tp->phydev);
        phy_resume(tp->phydev);
@@ -4869,7 +4888,7 @@ static int rtl8169_runtime_idle(struct device *device)
 {
        struct rtl8169_private *tp = dev_get_drvdata(device);
 
-       if (tp->dash_type != RTL_DASH_NONE)
+       if (tp->dash_enabled)
                return -EBUSY;
 
        if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev))
@@ -4895,8 +4914,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
        /* Restore original MAC address */
        rtl_rar_set(tp, tp->dev->perm_addr);
 
-       if (system_state == SYSTEM_POWER_OFF &&
-           tp->dash_type == RTL_DASH_NONE) {
+       if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) {
                pci_wake_from_d3(pdev, tp->saved_wolopts);
                pci_set_power_state(pdev, PCI_D3hot);
        }
@@ -5254,7 +5272,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1);
        tp->aspm_manageable = !rc;
 
-       tp->dash_type = rtl_check_dash(tp);
+       tp->dash_type = rtl_get_dash_type(tp);
+       tp->dash_enabled = rtl_dash_is_enabled(tp);
 
        tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK;
 
@@ -5325,7 +5344,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        /* configure chip for default features */
        rtl8169_set_features(dev, dev->features);
 
-       if (tp->dash_type == RTL_DASH_NONE) {
+       if (!tp->dash_enabled) {
                rtl_set_d3_pll_down(tp, true);
        } else {
                rtl_set_d3_pll_down(tp, false);
@@ -5365,7 +5384,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                            "ok" : "ko");
 
        if (tp->dash_type != RTL_DASH_NONE) {
-               netdev_info(dev, "DASH enabled\n");
+               netdev_info(dev, "DASH %s\n",
+                           tp->dash_enabled ? "enabled" : "disabled");
                rtl8168_driver_start(tp);
        }
 
index 3e50fd5..2afb2bd 100644 (file)
@@ -5293,6 +5293,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 
        dma_dir = page_pool_get_dma_dir(rx_q->page_pool);
        buf_sz = DIV_ROUND_UP(priv->dma_conf.dma_buf_sz, PAGE_SIZE) * PAGE_SIZE;
+       limit = min(priv->dma_conf.dma_rx_size - 1, (unsigned int)limit);
 
        if (netif_msg_rx_status(priv)) {
                void *rx_head;
@@ -5328,10 +5329,10 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
                        len = 0;
                }
 
+read_again:
                if (count >= limit)
                        break;
 
-read_again:
                buf1_len = 0;
                buf2_len = 0;
                entry = next_entry;
index 6c4b642..411898a 100644 (file)
@@ -2063,7 +2063,7 @@ static int prueth_probe(struct platform_device *pdev)
                                       &prueth->shram);
        if (ret) {
                dev_err(dev, "unable to get PRUSS SHRD RAM2: %d\n", ret);
-               pruss_put(prueth->pruss);
+               goto put_pruss;
        }
 
        prueth->sram_pool = of_gen_pool_get(np, "sram", 0);
@@ -2105,10 +2105,7 @@ static int prueth_probe(struct platform_device *pdev)
        prueth->iep1 = icss_iep_get_idx(np, 1);
        if (IS_ERR(prueth->iep1)) {
                ret = dev_err_probe(dev, PTR_ERR(prueth->iep1), "iep1 get failed\n");
-               icss_iep_put(prueth->iep0);
-               prueth->iep0 = NULL;
-               prueth->iep1 = NULL;
-               goto free_pool;
+               goto put_iep0;
        }
 
        if (prueth->pdata.quirk_10m_link_issue) {
@@ -2205,6 +2202,12 @@ netdev_exit:
 exit_iep:
        if (prueth->pdata.quirk_10m_link_issue)
                icss_iep_exit_fw(prueth->iep1);
+       icss_iep_put(prueth->iep1);
+
+put_iep0:
+       icss_iep_put(prueth->iep0);
+       prueth->iep0 = NULL;
+       prueth->iep1 = NULL;
 
 free_pool:
        gen_pool_free(prueth->sram_pool,
@@ -2212,6 +2215,8 @@ free_pool:
 
 put_mem:
        pruss_release_mem_region(prueth->pruss, &prueth->shram);
+
+put_pruss:
        pruss_put(prueth->pruss);
 
 put_cores:
index 21e9cac..2d5b021 100644 (file)
@@ -411,7 +411,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h,
        return addr;
 }
 
-static int ipvlan_process_v4_outbound(struct sk_buff *skb)
+static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb)
 {
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net_device *dev = skb->dev;
@@ -453,13 +453,11 @@ out:
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+
+static noinline_for_stack int
+ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb)
 {
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-       struct net_device *dev = skb->dev;
-       struct net *net = dev_net(dev);
-       struct dst_entry *dst;
-       int err, ret = NET_XMIT_DROP;
        struct flowi6 fl6 = {
                .flowi6_oif = dev->ifindex,
                .daddr = ip6h->daddr,
@@ -469,27 +467,38 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
                .flowi6_mark = skb->mark,
                .flowi6_proto = ip6h->nexthdr,
        };
+       struct dst_entry *dst;
+       int err;
 
-       dst = ip6_route_output(net, NULL, &fl6);
-       if (dst->error) {
-               ret = dst->error;
+       dst = ip6_route_output(dev_net(dev), NULL, &fl6);
+       err = dst->error;
+       if (err) {
                dst_release(dst);
-               goto err;
+               return err;
        }
        skb_dst_set(skb, dst);
+       return 0;
+}
+
+static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+{
+       struct net_device *dev = skb->dev;
+       int err, ret = NET_XMIT_DROP;
+
+       err = ipvlan_route_v6_outbound(dev, skb);
+       if (unlikely(err)) {
+               DEV_STATS_INC(dev, tx_errors);
+               kfree_skb(skb);
+               return err;
+       }
 
        memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
 
-       err = ip6_local_out(net, skb->sk, skb);
+       err = ip6_local_out(dev_net(dev), skb->sk, skb);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
                ret = NET_XMIT_SUCCESS;
-       goto out;
-err:
-       DEV_STATS_INC(dev, tx_errors);
-       kfree_skb(skb);
-out:
        return ret;
 }
 #else
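
Splitting the route lookup into a noinline_for_stack helper confines the sizeable struct flowi6 to a frame that is gone by the time ip6_local_out() runs, trimming the deep xmit call chain. A userspace analogue of the same stack-slimming idea (sizes invented for illustration):

        #include <string.h>

        __attribute__((noinline))               /* like noinline_for_stack */
        static int route_lookup(const char *key, size_t len)
        {
                char flow_keys[128];            /* the bulky local lives only here */

                memset(flow_keys, 0, sizeof(flow_keys));
                memcpy(flow_keys, key, len < sizeof(flow_keys) ? len : sizeof(flow_keys));
                return flow_keys[0] ? 0 : -1;
        }

        int xmit_path(const char *key, size_t len)
        {
                int err = route_lookup(key, len);       /* its frame is gone here */

                if (err)
                        return err;
                /* transmit with only this small frame on the stack */
                return 0;
        }
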
index 02bd201..c8da94a 100644 (file)
@@ -780,7 +780,7 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change)
        if (dev->flags & IFF_UP) {
                if (change & IFF_ALLMULTI)
                        dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1);
-               if (change & IFF_PROMISC)
+               if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC)
                        dev_set_promiscuity(lowerdev,
                                            dev->flags & IFF_PROMISC ? 1 : -1);
 
index ebcdffd..52d05ce 100644 (file)
@@ -453,6 +453,10 @@ ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg)
        case PPPIOCSMRU:
                if (get_user(val, (int __user *) argp))
                        break;
+               if (val > U16_MAX) {
+                       err = -EINVAL;
+                       break;
+               }
                if (val < PPP_MRU)
                        val = PPP_MRU;
                ap->mru = val;
@@ -687,7 +691,7 @@ ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count)
 
        /* strip address/control field if present */
        p = skb->data;
-       if (p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) {
+       if (skb->len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) {
                /* chop off address/control */
                if (skb->len < 3)
                        goto err;
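
The added skb->len >= 2 guard ensures both header bytes exist before they are compared, and the new PPPIOCSMRU bound rejects MRU values above U16_MAX before they are stored. A tiny userspace analogue of the length guard (PPP constants inlined):

        #include <stddef.h>
        #include <stdint.h>

        #define PPP_ALLSTATIONS 0xff
        #define PPP_UI          0x03

        /* bytes of address/control to strip; 0 if absent or frame too short */
        static size_t strip_ac_field(const uint8_t *p, size_t len)
        {
                if (len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI)
                        return 2;
                return 0;
        }
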
index 539d892..bb0d924 100644 (file)
@@ -176,7 +176,7 @@ static struct notifier_block parisc_panic_block = {
 static int qemu_power_off(struct sys_off_data *data)
 {
        /* this turns the system off via SeaBIOS */
-       *(int *)data->cb_data = 0;
+       gsc_writel(0, (unsigned long) data->cb_data);
        pdc_soft_power_button(1);
        return NOTIFY_DONE;
 }
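
gsc_writel() replaces the plain pointer store because the SeaBIOS power-off latch is device memory: it must be hit with a real, ordered bus write, not a C assignment the compiler may elide or reorder. A generic volatile-accessor analogue of such an MMIO write (the real gsc_writel additionally uses the right address space and access width for the GSC bus):

        #include <stdint.h>

        static inline void mmio_write32(volatile void *addr, uint32_t val)
        {
                /* volatile forces exactly one store, in program order with
                 * other volatile accesses */
                *(volatile uint32_t *)addr = val;
        }
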
index 3f7a747..7513018 100644 (file)
@@ -572,7 +572,8 @@ ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags,
 
        for (i = 0; i < cnt; i++) {
                event[i] = queue->buf[queue->head];
-               queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS;
+               /* Paired with READ_ONCE() in queue_cnt() */
+               WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS);
        }
 
        spin_unlock_irqrestore(&queue->lock, flags);
index 3134568..15b804b 100644 (file)
@@ -57,10 +57,11 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue,
        dst->t.sec = seconds;
        dst->t.nsec = remainder;
 
+       /* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */
        if (!queue_free(queue))
-               queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS;
+               WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS);
 
-       queue->tail = (queue->tail + 1) % PTP_MAX_TIMESTAMPS;
+       WRITE_ONCE(queue->tail, (queue->tail + 1) % PTP_MAX_TIMESTAMPS);
 
        spin_unlock_irqrestore(&queue->lock, flags);
 }
index 35fde0a..45f9002 100644 (file)
@@ -85,9 +85,13 @@ struct ptp_vclock {
  * that a writer might concurrently increment the tail does not
  * matter, since the queue remains nonempty nonetheless.
  */
-static inline int queue_cnt(struct timestamp_event_queue *q)
+static inline int queue_cnt(const struct timestamp_event_queue *q)
 {
-       int cnt = q->tail - q->head;
+       /*
+        * Paired with WRITE_ONCE() in enqueue_external_timestamp(),
+        * ptp_read(), extts_fifo_show().
+        */
+       int cnt = READ_ONCE(q->tail) - READ_ONCE(q->head);
        return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt;
 }
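
queue_cnt() is the lockless reader of this ring: writers advance head and tail under the queue spinlock, but this helper samples both with no lock held, so every access is now wrapped in READ_ONCE()/WRITE_ONCE() to stop the compiler from tearing or caching them. A self-contained C11 analogue using relaxed atomics:

        #include <stdatomic.h>

        #define PTP_MAX_TIMESTAMPS 128

        struct ts_queue {
                atomic_int head;        /* consumer index */
                atomic_int tail;        /* producer index */
        };

        static int queue_cnt(struct ts_queue *q)
        {
                /* relaxed loads ~ READ_ONCE(): no tearing, no stale copies */
                int cnt = atomic_load_explicit(&q->tail, memory_order_relaxed) -
                          atomic_load_explicit(&q->head, memory_order_relaxed);

                return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt;
        }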
 
index 7d023d9..f7a499a 100644 (file)
@@ -94,7 +94,8 @@ static ssize_t extts_fifo_show(struct device *dev,
        qcnt = queue_cnt(queue);
        if (qcnt) {
                event = queue->buf[queue->head];
-               queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS;
+               /* Paired with READ_ONCE() in queue_cnt() */
+               WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS);
        }
        spin_unlock_irqrestore(&queue->lock, flags);
 
index 32d1e73..03348f6 100644 (file)
@@ -1837,8 +1837,16 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res,
                }
 
                spin_lock_irqsave(qp->qp_lock_ptr, *flags);
-               if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd)))
-                       sp->done(sp, res);
+               switch (sp->type) {
+               case SRB_SCSI_CMD:
+                       if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd)))
+                               sp->done(sp, res);
+                       break;
+               default:
+                       if (ret_cmd)
+                               sp->done(sp, res);
+                       break;
+               }
        } else {
                sp->done(sp, res);
        }
index 67922e2..6d8218a 100644
@@ -1019,7 +1019,7 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf,
        struct sdebug_err_inject *inject;
        struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private;
 
-       buf = kmalloc(count, GFP_KERNEL);
+       buf = kzalloc(count + 1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
 
@@ -1132,7 +1132,6 @@ static const struct file_operations sdebug_target_reset_fail_fops = {
 static int sdebug_target_alloc(struct scsi_target *starget)
 {
        struct sdebug_target_info *targetip;
-       struct dentry *dentry;
 
        targetip = kzalloc(sizeof(struct sdebug_target_info), GFP_KERNEL);
        if (!targetip)
@@ -1140,15 +1139,9 @@ static int sdebug_target_alloc(struct scsi_target *starget)
 
        targetip->debugfs_entry = debugfs_create_dir(dev_name(&starget->dev),
                                sdebug_debugfs_root);
-       if (IS_ERR_OR_NULL(targetip->debugfs_entry))
-               pr_info("%s: failed to create debugfs directory for target %s\n",
-                       __func__, dev_name(&starget->dev));
 
        debugfs_create_file("fail_reset", 0600, targetip->debugfs_entry, starget,
                                &sdebug_target_reset_fail_fops);
-       if (IS_ERR_OR_NULL(dentry))
-               pr_info("%s: failed to create fail_reset file for target %s\n",
-                       __func__, dev_name(&starget->dev));
 
        starget->hostdata = targetip;
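
A note on the first hunk's kzalloc(count + 1, ...): the buffer is subsequently parsed as a C string, so it needs a terminating NUL after copy_from_user(), which the old kmalloc(count) never provided. A sketch of the underlying idiom, with example_write() and the kstrtoint() parse as illustrative placeholders rather than the driver's actual code:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	char *buf;
	int val, ret;

	buf = kzalloc(count + 1, GFP_KERNEL);	/* extra byte stays zero */
	if (!buf)
		return -ENOMEM;

	if (copy_from_user(buf, ubuf, count)) {
		kfree(buf);
		return -EFAULT;
	}

	ret = kstrtoint(buf, 0, &val);	/* safe: buf is NUL-terminated */
	kfree(buf);

	return ret ? ret : count;
}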
 
index 530918c..fa00dd5 100644
@@ -1643,24 +1643,21 @@ out:
        return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 }
 
-static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
+static int sd_sync_cache(struct scsi_disk *sdkp)
 {
        int retries, res;
        struct scsi_device *sdp = sdkp->device;
        const int timeout = sdp->request_queue->rq_timeout
                * SD_FLUSH_TIMEOUT_MULTIPLIER;
-       struct scsi_sense_hdr my_sshdr;
+       struct scsi_sense_hdr sshdr;
        const struct scsi_exec_args exec_args = {
                .req_flags = BLK_MQ_REQ_PM,
-               /* caller might not be interested in sense, but we need it */
-               .sshdr = sshdr ? : &my_sshdr,
+               .sshdr = &sshdr,
        };
 
        if (!scsi_device_online(sdp))
                return -ENODEV;
 
-       sshdr = exec_args.sshdr;
-
        for (retries = 3; retries > 0; --retries) {
                unsigned char cmd[16] = { 0 };
 
@@ -1685,15 +1682,23 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
                        return res;
 
                if (scsi_status_is_check_condition(res) &&
-                   scsi_sense_valid(sshdr)) {
-                       sd_print_sense_hdr(sdkp, sshdr);
+                   scsi_sense_valid(&sshdr)) {
+                       sd_print_sense_hdr(sdkp, &sshdr);
 
                        /* we need to evaluate the error return  */
-                       if (sshdr->asc == 0x3a ||       /* medium not present */
-                           sshdr->asc == 0x20 ||       /* invalid command */
-                           (sshdr->asc == 0x74 && sshdr->ascq == 0x71))        /* drive is password locked */
+                       if (sshdr.asc == 0x3a ||        /* medium not present */
+                           sshdr.asc == 0x20 ||        /* invalid command */
+                           (sshdr.asc == 0x74 && sshdr.ascq == 0x71))  /* drive is password locked */
                                /* this is no error here */
                                return 0;
+                       /*
+                        * The drive doesn't support sync. There's not much we
+                        * can do here, and since this runs during shutdown or
+                        * suspend, just return success so those operations can
+                        * proceed.
+                        */
+                       if (sshdr.sense_key == ILLEGAL_REQUEST)
+                               return 0;
                }
 
                switch (host_byte(res)) {
@@ -3853,7 +3858,7 @@ static void sd_shutdown(struct device *dev)
 
        if (sdkp->WCE && sdkp->media_present) {
                sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
-               sd_sync_cache(sdkp, NULL);
+               sd_sync_cache(sdkp);
        }
 
        if ((system_state != SYSTEM_RESTART &&
@@ -3874,7 +3879,6 @@ static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime)
 static int sd_suspend_common(struct device *dev, bool runtime)
 {
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
-       struct scsi_sense_hdr sshdr;
        int ret = 0;
 
        if (!sdkp)      /* E.g.: runtime suspend following sd_remove() */
@@ -3883,24 +3887,13 @@ static int sd_suspend_common(struct device *dev, bool runtime)
        if (sdkp->WCE && sdkp->media_present) {
                if (!sdkp->device->silence_suspend)
                        sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
-               ret = sd_sync_cache(sdkp, &sshdr);
-
-               if (ret) {
-                       /* ignore OFFLINE device */
-                       if (ret == -ENODEV)
-                               return 0;
-
-                       if (!scsi_sense_valid(&sshdr) ||
-                           sshdr.sense_key != ILLEGAL_REQUEST)
-                               return ret;
+               ret = sd_sync_cache(sdkp);
+               /* ignore OFFLINE device */
+               if (ret == -ENODEV)
+                       return 0;
 
-                       /*
-                        * sshdr.sense_key == ILLEGAL_REQUEST means this drive
-                        * doesn't support sync. There's not much to do and
-                        * suspend shouldn't fail.
-                        */
-                       ret = 0;
-               }
+               if (ret)
+                       return ret;
        }
 
        if (sd_do_start_stop(sdkp->device, runtime)) {
index 2ba8ec2..0787456 100644
@@ -436,7 +436,7 @@ int ufshcd_mcq_init(struct ufs_hba *hba)
 
        for (i = 0; i < hba->nr_hw_queues; i++) {
                hwq = &hba->uhq[i];
-               hwq->max_entries = hba->nutrs;
+               hwq->max_entries = hba->nutrs + 1;
                spin_lock_init(&hwq->sq_lock);
                spin_lock_init(&hwq->cq_lock);
                mutex_init(&hwq->sq_mutex);
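
The nutrs + 1 sizing in this hunk is the classic circular-queue reservation: when occupancy is judged purely from the head and tail pointers, one slot must stay unused, otherwise head == tail would be ambiguous between empty and full; a ring that must hold nutrs outstanding requests therefore needs nutrs + 1 entries. A hedged sketch of the convention (these helpers are illustrative, not the driver's code):

#include <linux/types.h>

/* With 'entries' slots, at most entries - 1 can be occupied. */
static inline bool ring_empty(u32 head, u32 tail)
{
	return head == tail;
}

static inline bool ring_full(u32 head, u32 tail, u32 entries)
{
	return (tail + 1) % entries == head;
}
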
@@ -630,6 +630,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
        int tag = scsi_cmd_to_rq(cmd)->tag;
        struct ufshcd_lrb *lrbp = &hba->lrb[tag];
        struct ufs_hw_queue *hwq;
+       unsigned long flags;
        int err = FAILED;
 
        if (!ufshcd_cmd_inflight(lrbp->cmd)) {
@@ -670,8 +671,10 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
        }
 
        err = SUCCESS;
+       spin_lock_irqsave(&hwq->cq_lock, flags);
        if (ufshcd_cmd_inflight(lrbp->cmd))
                ufshcd_release_scsi_cmd(hba, lrbp);
+       spin_unlock_irqrestore(&hwq->cq_lock, flags);
 
 out:
        return err;
index b3a3cb1..b137f36 100644
@@ -437,7 +437,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
        if (blk->shared_backend) {
                blk->buffer = shared_buffer;
        } else {
-               blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+               blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
                                       GFP_KERNEL);
                if (!blk->buffer) {
                        ret = -ENOMEM;
@@ -495,7 +495,7 @@ static int __init vdpasim_blk_init(void)
                goto parent_err;
 
        if (shared_backend) {
-               shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
+               shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT,
                                         GFP_KERNEL);
                if (!shared_buffer) {
                        ret = -ENOMEM;
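
The kvmalloc() → kvzalloc() swap ensures the simulated disk's backing store starts out zeroed instead of exposing stale kernel memory to the guest. The two allocators differ only in the zeroing flag; roughly:

/* Equivalent ways to get a zeroed kvmalloc() buffer: */
buf = kvzalloc(size, GFP_KERNEL);
buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
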
index 30df5c5..da7ec77 100644
@@ -1582,7 +1582,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
 
 err:
        put_device(&v->dev);
-       ida_simple_remove(&vhost_vdpa_ida, v->minor);
        return r;
 }
 
index c2524a7..7a55939 100644
@@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev)
                        if (v != VIRTIO_MSI_NO_VECTOR) {
                                int irq = pci_irq_vector(vp_dev->pci_dev, v);
 
-                               irq_set_affinity_hint(irq, NULL);
+                               irq_update_affinity_hint(irq, NULL);
                                free_irq(irq, vq);
                        }
                }
@@ -443,10 +443,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
                mask = vp_dev->msix_affinity_masks[info->msix_vector];
                irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector);
                if (!cpu_mask)
-                       irq_set_affinity_hint(irq, NULL);
+                       irq_update_affinity_hint(irq, NULL);
                else {
                        cpumask_copy(mask, cpu_mask);
-                       irq_set_affinity_hint(irq, mask);
+                       irq_set_affinity_and_hint(irq, mask);
                }
        }
        return 0;
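
Background on the API swap in this file's hunks: irq_set_affinity_hint() is the legacy call that both forces the affinity and records the hint; the modern split lets callers do one or the other, so clearing uses irq_update_affinity_hint() (hint only) while setting uses irq_set_affinity_and_hint() (affinity plus hint). Schematically:

/* Legacy combined call and its split replacements: */
irq_set_affinity_hint(irq, NULL);	/* -> irq_update_affinity_hint(irq, NULL)  */
irq_set_affinity_hint(irq, mask);	/* -> irq_set_affinity_and_hint(irq, mask) */
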
index e2a1fe7..7de8b1e 100644
@@ -294,9 +294,10 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
 
        err = -EINVAL;
        mdev->common = vp_modern_map_capability(mdev, common,
-                                     sizeof(struct virtio_pci_common_cfg), 4,
-                                     0, sizeof(struct virtio_pci_modern_common_cfg),
-                                     &mdev->common_len, NULL);
+                             sizeof(struct virtio_pci_common_cfg), 4, 0,
+                             offsetofend(struct virtio_pci_modern_common_cfg,
+                                         queue_reset),
+                             &mdev->common_len, NULL);
        if (!mdev->common)
                goto err_map_common;
        mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1,
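
For readers who haven't met the helper: offsetofend() yields the offset of the first byte past a member, which is exactly the length wanted when a capability must be mapped up to and including a given field (queue_reset here) instead of hard-coding the wrapper struct's full size. Its semantics, approximately (not the verbatim kernel macro):

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
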
index b8f2f97..e358533 100644
@@ -171,11 +171,11 @@ static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl)
        int i;
        struct shared_info *s = HYPERVISOR_shared_info;
        struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+       evtchn_port_t evtchn;
 
        /* Timer interrupt has highest priority. */
-       irq = irq_from_virq(cpu, VIRQ_TIMER);
+       irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn);
        if (irq != -1) {
-               evtchn_port_t evtchn = evtchn_from_irq(irq);
                word_idx = evtchn / BITS_PER_LONG;
                bit_idx = evtchn % BITS_PER_LONG;
                if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx))
@@ -328,9 +328,9 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
        for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) {
                if (sync_test_bit(i, BM(sh->evtchn_pending))) {
                        int word_idx = i / BITS_PER_EVTCHN_WORD;
-                       printk("  %d: event %d -> irq %d%s%s%s\n",
+                       printk("  %d: event %d -> irq %u%s%s%s\n",
                               cpu_from_evtchn(i), i,
-                              get_evtchn_to_irq(i),
+                              irq_from_evtchn(i),
                               sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
                               ? "" : " l2-clear",
                               !sync_test_bit(i, BM(sh->evtchn_mask))
index 6de6b08..f5edb9e 100644
@@ -164,6 +164,8 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
 
 /* IRQ <-> IPI mapping */
 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
+/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */
+static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... XEN_NR_IPIS-1] = 0};
 
 /* Event channel distribution data */
 static atomic_t channels_on_cpu[NR_CPUS];
@@ -172,7 +174,7 @@ static int **evtchn_to_irq;
 #ifdef CONFIG_X86
 static unsigned long *pirq_eoi_map;
 #endif
-static bool (*pirq_needs_eoi)(unsigned irq);
+static bool (*pirq_needs_eoi)(struct irq_info *info);
 
 #define EVTCHN_ROW(e)  (e / (PAGE_SIZE/sizeof(**evtchn_to_irq)))
 #define EVTCHN_COL(e)  (e % (PAGE_SIZE/sizeof(**evtchn_to_irq)))
@@ -188,7 +190,6 @@ static struct irq_chip xen_lateeoi_chip;
 static struct irq_chip xen_percpu_chip;
 static struct irq_chip xen_pirq_chip;
 static void enable_dynirq(struct irq_data *data);
-static void disable_dynirq(struct irq_data *data);
 
 static DEFINE_PER_CPU(unsigned int, irq_epoch);
 
@@ -246,15 +247,6 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq)
        return 0;
 }
 
-int get_evtchn_to_irq(evtchn_port_t evtchn)
-{
-       if (evtchn >= xen_evtchn_max_channels())
-               return -1;
-       if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL)
-               return -1;
-       return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]);
-}
-
 /* Get info for IRQ */
 static struct irq_info *info_for_irq(unsigned irq)
 {
@@ -272,6 +264,19 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info)
                irq_set_chip_data(irq, info);
 }
 
+static struct irq_info *evtchn_to_info(evtchn_port_t evtchn)
+{
+       int irq;
+
+       if (evtchn >= xen_evtchn_max_channels())
+               return NULL;
+       if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL)
+               return NULL;
+       irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]);
+
+       return (irq < 0) ? NULL : info_for_irq(irq);
+}
+
 /* Per CPU channel accounting */
 static void channels_on_cpu_dec(struct irq_info *info)
 {
@@ -298,6 +303,13 @@ static void channels_on_cpu_inc(struct irq_info *info)
        info->is_accounted = 1;
 }
 
+static void xen_irq_free_desc(unsigned int irq)
+{
+       /* Legacy IRQ descriptors are managed by the arch. */
+       if (irq >= nr_legacy_irqs())
+               irq_free_desc(irq);
+}
+
 static void delayed_free_irq(struct work_struct *work)
 {
        struct irq_info *info = container_of(to_rcu_work(work), struct irq_info,
@@ -309,14 +321,11 @@ static void delayed_free_irq(struct work_struct *work)
 
        kfree(info);
 
-       /* Legacy IRQ descriptors are managed by the arch. */
-       if (irq >= nr_legacy_irqs())
-               irq_free_desc(irq);
+       xen_irq_free_desc(irq);
 }
 
 /* Constructors for packed IRQ information. */
 static int xen_irq_info_common_setup(struct irq_info *info,
-                                    unsigned irq,
                                     enum xen_irq_type type,
                                     evtchn_port_t evtchn,
                                     unsigned short cpu)
@@ -326,29 +335,27 @@ static int xen_irq_info_common_setup(struct irq_info *info,
        BUG_ON(info->type != IRQT_UNBOUND && info->type != type);
 
        info->type = type;
-       info->irq = irq;
        info->evtchn = evtchn;
        info->cpu = cpu;
        info->mask_reason = EVT_MASK_REASON_EXPLICIT;
        raw_spin_lock_init(&info->lock);
 
-       ret = set_evtchn_to_irq(evtchn, irq);
+       ret = set_evtchn_to_irq(evtchn, info->irq);
        if (ret < 0)
                return ret;
 
-       irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN);
+       irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN);
 
        return xen_evtchn_port_setup(evtchn);
 }
 
-static int xen_irq_info_evtchn_setup(unsigned irq,
+static int xen_irq_info_evtchn_setup(struct irq_info *info,
                                     evtchn_port_t evtchn,
                                     struct xenbus_device *dev)
 {
-       struct irq_info *info = info_for_irq(irq);
        int ret;
 
-       ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0);
+       ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0);
        info->u.interdomain = dev;
        if (dev)
                atomic_inc(&dev->event_channels);
@@ -356,49 +363,37 @@ static int xen_irq_info_evtchn_setup(unsigned irq,
        return ret;
 }
 
-static int xen_irq_info_ipi_setup(unsigned cpu,
-                                 unsigned irq,
-                                 evtchn_port_t evtchn,
-                                 enum ipi_vector ipi)
+static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu,
+                                 evtchn_port_t evtchn, enum ipi_vector ipi)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        info->u.ipi = ipi;
 
-       per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+       per_cpu(ipi_to_irq, cpu)[ipi] = info->irq;
+       per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn;
 
-       return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0);
+       return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0);
 }
 
-static int xen_irq_info_virq_setup(unsigned cpu,
-                                  unsigned irq,
-                                  evtchn_port_t evtchn,
-                                  unsigned virq)
+static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu,
+                                  evtchn_port_t evtchn, unsigned int virq)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        info->u.virq = virq;
 
-       per_cpu(virq_to_irq, cpu)[virq] = irq;
+       per_cpu(virq_to_irq, cpu)[virq] = info->irq;
 
-       return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0);
+       return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0);
 }
 
-static int xen_irq_info_pirq_setup(unsigned irq,
-                                  evtchn_port_t evtchn,
-                                  unsigned pirq,
-                                  unsigned gsi,
-                                  uint16_t domid,
-                                  unsigned char flags)
+static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn,
+                                  unsigned int pirq, unsigned int gsi,
+                                  uint16_t domid, unsigned char flags)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        info->u.pirq.pirq = pirq;
        info->u.pirq.gsi = gsi;
        info->u.pirq.domid = domid;
        info->u.pirq.flags = flags;
 
-       return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0);
+       return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0);
 }
 
 static void xen_irq_info_cleanup(struct irq_info *info)
@@ -412,7 +407,7 @@ static void xen_irq_info_cleanup(struct irq_info *info)
 /*
  * Accessors for packed IRQ information.
  */
-evtchn_port_t evtchn_from_irq(unsigned irq)
+static evtchn_port_t evtchn_from_irq(unsigned int irq)
 {
        const struct irq_info *info = NULL;
 
@@ -426,64 +421,51 @@ evtchn_port_t evtchn_from_irq(unsigned irq)
 
 unsigned int irq_from_evtchn(evtchn_port_t evtchn)
 {
-       return get_evtchn_to_irq(evtchn);
+       struct irq_info *info = evtchn_to_info(evtchn);
+
+       return info ? info->irq : -1;
 }
 EXPORT_SYMBOL_GPL(irq_from_evtchn);
 
-int irq_from_virq(unsigned int cpu, unsigned int virq)
+int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq,
+                        evtchn_port_t *evtchn)
 {
-       return per_cpu(virq_to_irq, cpu)[virq];
+       int irq = per_cpu(virq_to_irq, cpu)[virq];
+
+       *evtchn = evtchn_from_irq(irq);
+
+       return irq;
 }
 
-static enum ipi_vector ipi_from_irq(unsigned irq)
+static enum ipi_vector ipi_from_irq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        BUG_ON(info == NULL);
        BUG_ON(info->type != IRQT_IPI);
 
        return info->u.ipi;
 }
 
-static unsigned virq_from_irq(unsigned irq)
+static unsigned int virq_from_irq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        BUG_ON(info == NULL);
        BUG_ON(info->type != IRQT_VIRQ);
 
        return info->u.virq;
 }
 
-static unsigned pirq_from_irq(unsigned irq)
+static unsigned int pirq_from_irq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        BUG_ON(info == NULL);
        BUG_ON(info->type != IRQT_PIRQ);
 
        return info->u.pirq.pirq;
 }
 
-static enum xen_irq_type type_from_irq(unsigned irq)
-{
-       return info_for_irq(irq)->type;
-}
-
-static unsigned cpu_from_irq(unsigned irq)
-{
-       return info_for_irq(irq)->cpu;
-}
-
 unsigned int cpu_from_evtchn(evtchn_port_t evtchn)
 {
-       int irq = get_evtchn_to_irq(evtchn);
-       unsigned ret = 0;
-
-       if (irq != -1)
-               ret = cpu_from_irq(irq);
+       struct irq_info *info = evtchn_to_info(evtchn);
 
-       return ret;
+       return info ? info->cpu : 0;
 }
 
 static void do_mask(struct irq_info *info, u8 reason)
@@ -515,36 +497,30 @@ static void do_unmask(struct irq_info *info, u8 reason)
 }
 
 #ifdef CONFIG_X86
-static bool pirq_check_eoi_map(unsigned irq)
+static bool pirq_check_eoi_map(struct irq_info *info)
 {
-       return test_bit(pirq_from_irq(irq), pirq_eoi_map);
+       return test_bit(pirq_from_irq(info), pirq_eoi_map);
 }
 #endif
 
-static bool pirq_needs_eoi_flag(unsigned irq)
+static bool pirq_needs_eoi_flag(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(irq);
        BUG_ON(info->type != IRQT_PIRQ);
 
        return info->u.pirq.flags & PIRQ_NEEDS_EOI;
 }
 
-static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu,
+static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu,
                               bool force_affinity)
 {
-       int irq = get_evtchn_to_irq(evtchn);
-       struct irq_info *info = info_for_irq(irq);
-
-       BUG_ON(irq == -1);
-
        if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
-               struct irq_data *data = irq_get_irq_data(irq);
+               struct irq_data *data = irq_get_irq_data(info->irq);
 
                irq_data_update_affinity(data, cpumask_of(cpu));
                irq_data_update_effective_affinity(data, cpumask_of(cpu));
        }
 
-       xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);
+       xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu);
 
        channels_on_cpu_dec(info);
        info->cpu = cpu;
@@ -601,7 +577,9 @@ static void lateeoi_list_add(struct irq_info *info)
 
        spin_lock_irqsave(&eoi->eoi_list_lock, flags);
 
-       if (list_empty(&eoi->eoi_list)) {
+       elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
+                                       eoi_list);
+       if (!elem || info->eoi_time < elem->eoi_time) {
                list_add(&info->eoi_list, &eoi->eoi_list);
                mod_delayed_work_on(info->eoi_cpu, system_wq,
                                    &eoi->delayed, delay);
@@ -732,50 +710,49 @@ void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags)
 }
 EXPORT_SYMBOL_GPL(xen_irq_lateeoi);
 
-static void xen_irq_init(unsigned irq)
+static struct irq_info *xen_irq_init(unsigned int irq)
 {
        struct irq_info *info;
 
        info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (info == NULL)
-               panic("Unable to allocate metadata for IRQ%d\n", irq);
+       if (info) {
+               info->irq = irq;
+               info->type = IRQT_UNBOUND;
+               info->refcnt = -1;
+               INIT_RCU_WORK(&info->rwork, delayed_free_irq);
 
-       info->type = IRQT_UNBOUND;
-       info->refcnt = -1;
-       INIT_RCU_WORK(&info->rwork, delayed_free_irq);
+               set_info_for_irq(irq, info);
+               /*
+                * Interrupt affinity setting can be immediate. No point
+                * in delaying it until an interrupt is handled.
+                */
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 
-       set_info_for_irq(irq, info);
-       /*
-        * Interrupt affinity setting can be immediate. No point
-        * in delaying it until an interrupt is handled.
-        */
-       irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+               INIT_LIST_HEAD(&info->eoi_list);
+               list_add_tail(&info->list, &xen_irq_list_head);
+       }
 
-       INIT_LIST_HEAD(&info->eoi_list);
-       list_add_tail(&info->list, &xen_irq_list_head);
+       return info;
 }
 
-static int __must_check xen_allocate_irqs_dynamic(int nvec)
+static struct irq_info *xen_allocate_irq_dynamic(void)
 {
-       int i, irq = irq_alloc_descs(-1, 0, nvec, -1);
+       int irq = irq_alloc_desc_from(0, -1);
+       struct irq_info *info = NULL;
 
        if (irq >= 0) {
-               for (i = 0; i < nvec; i++)
-                       xen_irq_init(irq + i);
+               info = xen_irq_init(irq);
+               if (!info)
+                       xen_irq_free_desc(irq);
        }
 
-       return irq;
-}
-
-static inline int __must_check xen_allocate_irq_dynamic(void)
-{
-
-       return xen_allocate_irqs_dynamic(1);
+       return info;
 }
 
-static int __must_check xen_allocate_irq_gsi(unsigned gsi)
+static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi)
 {
        int irq;
+       struct irq_info *info;
 
        /*
         * A PV guest has no concept of a GSI (since it has no ACPI
@@ -792,15 +769,15 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi)
        else
                irq = irq_alloc_desc_at(gsi, -1);
 
-       xen_irq_init(irq);
+       info = xen_irq_init(irq);
+       if (!info)
+               xen_irq_free_desc(irq);
 
-       return irq;
+       return info;
 }
 
-static void xen_free_irq(unsigned irq)
+static void xen_free_irq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(irq);
-
        if (WARN_ON(!info))
                return;
 
@@ -821,14 +798,11 @@ static void event_handler_exit(struct irq_info *info)
        clear_evtchn(info->evtchn);
 }
 
-static void pirq_query_unmask(int irq)
+static void pirq_query_unmask(struct irq_info *info)
 {
        struct physdev_irq_status_query irq_status;
-       struct irq_info *info = info_for_irq(irq);
-
-       BUG_ON(info->type != IRQT_PIRQ);
 
-       irq_status.irq = pirq_from_irq(irq);
+       irq_status.irq = pirq_from_irq(info);
        if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
                irq_status.flags = 0;
 
@@ -837,61 +811,81 @@ static void pirq_query_unmask(int irq)
                info->u.pirq.flags |= PIRQ_NEEDS_EOI;
 }
 
-static void eoi_pirq(struct irq_data *data)
+static void do_eoi_pirq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(data->irq);
-       evtchn_port_t evtchn = info ? info->evtchn : 0;
-       struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
+       struct physdev_eoi eoi = { .irq = pirq_from_irq(info) };
        int rc = 0;
 
-       if (!VALID_EVTCHN(evtchn))
+       if (!VALID_EVTCHN(info->evtchn))
                return;
 
        event_handler_exit(info);
 
-       if (pirq_needs_eoi(data->irq)) {
+       if (pirq_needs_eoi(info)) {
                rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
                WARN_ON(rc);
        }
 }
 
+static void eoi_pirq(struct irq_data *data)
+{
+       struct irq_info *info = info_for_irq(data->irq);
+
+       do_eoi_pirq(info);
+}
+
+static void do_disable_dynirq(struct irq_info *info)
+{
+       if (VALID_EVTCHN(info->evtchn))
+               do_mask(info, EVT_MASK_REASON_EXPLICIT);
+}
+
+static void disable_dynirq(struct irq_data *data)
+{
+       struct irq_info *info = info_for_irq(data->irq);
+
+       if (info)
+               do_disable_dynirq(info);
+}
+
 static void mask_ack_pirq(struct irq_data *data)
 {
-       disable_dynirq(data);
-       eoi_pirq(data);
+       struct irq_info *info = info_for_irq(data->irq);
+
+       if (info) {
+               do_disable_dynirq(info);
+               do_eoi_pirq(info);
+       }
 }
 
-static unsigned int __startup_pirq(unsigned int irq)
+static unsigned int __startup_pirq(struct irq_info *info)
 {
        struct evtchn_bind_pirq bind_pirq;
-       struct irq_info *info = info_for_irq(irq);
-       evtchn_port_t evtchn = evtchn_from_irq(irq);
+       evtchn_port_t evtchn = info->evtchn;
        int rc;
 
-       BUG_ON(info->type != IRQT_PIRQ);
-
        if (VALID_EVTCHN(evtchn))
                goto out;
 
-       bind_pirq.pirq = pirq_from_irq(irq);
+       bind_pirq.pirq = pirq_from_irq(info);
        /* NB. We are happy to share unless we are probing. */
        bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
                                        BIND_PIRQ__WILL_SHARE : 0;
        rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
        if (rc != 0) {
-               pr_warn("Failed to obtain physical IRQ %d\n", irq);
+               pr_warn("Failed to obtain physical IRQ %d\n", info->irq);
                return 0;
        }
        evtchn = bind_pirq.port;
 
-       pirq_query_unmask(irq);
+       pirq_query_unmask(info);
 
-       rc = set_evtchn_to_irq(evtchn, irq);
+       rc = set_evtchn_to_irq(evtchn, info->irq);
        if (rc)
                goto err;
 
        info->evtchn = evtchn;
-       bind_evtchn_to_cpu(evtchn, 0, false);
+       bind_evtchn_to_cpu(info, 0, false);
 
        rc = xen_evtchn_port_setup(evtchn);
        if (rc)
@@ -900,26 +894,28 @@ static unsigned int __startup_pirq(unsigned int irq)
 out:
        do_unmask(info, EVT_MASK_REASON_EXPLICIT);
 
-       eoi_pirq(irq_get_irq_data(irq));
+       do_eoi_pirq(info);
 
        return 0;
 
 err:
-       pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc);
+       pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq,
+              rc);
        xen_evtchn_close(evtchn);
        return 0;
 }
 
 static unsigned int startup_pirq(struct irq_data *data)
 {
-       return __startup_pirq(data->irq);
+       struct irq_info *info = info_for_irq(data->irq);
+
+       return __startup_pirq(info);
 }
 
 static void shutdown_pirq(struct irq_data *data)
 {
-       unsigned int irq = data->irq;
-       struct irq_info *info = info_for_irq(irq);
-       evtchn_port_t evtchn = evtchn_from_irq(irq);
+       struct irq_info *info = info_for_irq(data->irq);
+       evtchn_port_t evtchn = info->evtchn;
 
        BUG_ON(info->type != IRQT_PIRQ);
 
@@ -957,10 +953,14 @@ int xen_irq_from_gsi(unsigned gsi)
 }
 EXPORT_SYMBOL_GPL(xen_irq_from_gsi);
 
-static void __unbind_from_irq(unsigned int irq)
+static void __unbind_from_irq(struct irq_info *info, unsigned int irq)
 {
-       evtchn_port_t evtchn = evtchn_from_irq(irq);
-       struct irq_info *info = info_for_irq(irq);
+       evtchn_port_t evtchn;
+
+       if (!info) {
+               xen_irq_free_desc(irq);
+               return;
+       }
 
        if (info->refcnt > 0) {
                info->refcnt--;
@@ -968,19 +968,22 @@ static void __unbind_from_irq(unsigned int irq)
                        return;
        }
 
+       evtchn = info->evtchn;
+
        if (VALID_EVTCHN(evtchn)) {
-               unsigned int cpu = cpu_from_irq(irq);
+               unsigned int cpu = info->cpu;
                struct xenbus_device *dev;
 
                if (!info->is_static)
                        xen_evtchn_close(evtchn);
 
-               switch (type_from_irq(irq)) {
+               switch (info->type) {
                case IRQT_VIRQ:
-                       per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1;
+                       per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1;
                        break;
                case IRQT_IPI:
-                       per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1;
+                       per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1;
+                       per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0;
                        break;
                case IRQT_EVTCHN:
                        dev = info->u.interdomain;
@@ -994,7 +997,7 @@ static void __unbind_from_irq(unsigned int irq)
                xen_irq_info_cleanup(info);
        }
 
-       xen_free_irq(irq);
+       xen_free_irq(info);
 }
 
 /*
@@ -1010,24 +1013,24 @@ static void __unbind_from_irq(unsigned int irq)
 int xen_bind_pirq_gsi_to_irq(unsigned gsi,
                             unsigned pirq, int shareable, char *name)
 {
-       int irq;
+       struct irq_info *info;
        struct physdev_irq irq_op;
        int ret;
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = xen_irq_from_gsi(gsi);
-       if (irq != -1) {
+       ret = xen_irq_from_gsi(gsi);
+       if (ret != -1) {
                pr_info("%s: returning irq %d for gsi %u\n",
-                       __func__, irq, gsi);
+                       __func__, ret, gsi);
                goto out;
        }
 
-       irq = xen_allocate_irq_gsi(gsi);
-       if (irq < 0)
+       info = xen_allocate_irq_gsi(gsi);
+       if (!info)
                goto out;
 
-       irq_op.irq = irq;
+       irq_op.irq = info->irq;
        irq_op.vector = 0;
 
        /* Only the privileged domain can do this. For non-priv, the pcifront
@@ -1035,20 +1038,19 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
         * this in the priv domain. */
        if (xen_initial_domain() &&
            HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
-               xen_free_irq(irq);
-               irq = -ENOSPC;
+               xen_free_irq(info);
+               ret = -ENOSPC;
                goto out;
        }
 
-       ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF,
+       ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF,
                               shareable ? PIRQ_SHAREABLE : 0);
        if (ret < 0) {
-               __unbind_from_irq(irq);
-               irq = ret;
+               __unbind_from_irq(info, info->irq);
                goto out;
        }
 
-       pirq_query_unmask(irq);
+       pirq_query_unmask(info);
        /* We try to use the handler with the appropriate semantic for the
         * type of interrupt: if the interrupt is an edge triggered
         * interrupt we use handle_edge_irq.
@@ -1065,16 +1067,18 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
         * is the right choice either way.
         */
        if (shareable)
-               irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+               irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip,
                                handle_fasteoi_irq, name);
        else
-               irq_set_chip_and_handler_name(irq, &xen_pirq_chip,
+               irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip,
                                handle_edge_irq, name);
 
+       ret = info->irq;
+
 out:
        mutex_unlock(&irq_mapping_update_lock);
 
-       return irq;
+       return ret;
 }
 
 #ifdef CONFIG_PCI_MSI
@@ -1096,17 +1100,22 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
                             int pirq, int nvec, const char *name, domid_t domid)
 {
        int i, irq, ret;
+       struct irq_info *info;
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = xen_allocate_irqs_dynamic(nvec);
+       irq = irq_alloc_descs(-1, 0, nvec, -1);
        if (irq < 0)
                goto out;
 
        for (i = 0; i < nvec; i++) {
+               info = xen_irq_init(irq + i);
+               if (!info)
+                       goto error_irq;
+
                irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name);
 
-               ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid,
+               ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid,
                                              i == 0 ? 0 : PIRQ_MSI_GROUP);
                if (ret < 0)
                        goto error_irq;
@@ -1118,9 +1127,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 out:
        mutex_unlock(&irq_mapping_update_lock);
        return irq;
+
 error_irq:
-       while (nvec--)
-               __unbind_from_irq(irq + nvec);
+       while (nvec--) {
+               info = info_for_irq(irq + nvec);
+               __unbind_from_irq(info, irq + nvec);
+       }
        mutex_unlock(&irq_mapping_update_lock);
        return ret;
 }
@@ -1156,67 +1168,45 @@ int xen_destroy_irq(int irq)
                }
        }
 
-       xen_free_irq(irq);
+       xen_free_irq(info);
 
 out:
        mutex_unlock(&irq_mapping_update_lock);
        return rc;
 }
 
-int xen_irq_from_pirq(unsigned pirq)
-{
-       int irq;
-
-       struct irq_info *info;
-
-       mutex_lock(&irq_mapping_update_lock);
-
-       list_for_each_entry(info, &xen_irq_list_head, list) {
-               if (info->type != IRQT_PIRQ)
-                       continue;
-               irq = info->irq;
-               if (info->u.pirq.pirq == pirq)
-                       goto out;
-       }
-       irq = -1;
-out:
-       mutex_unlock(&irq_mapping_update_lock);
-
-       return irq;
-}
-
-
 int xen_pirq_from_irq(unsigned irq)
 {
-       return pirq_from_irq(irq);
+       struct irq_info *info = info_for_irq(irq);
+
+       return pirq_from_irq(info);
 }
 EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
 
 static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
                                   struct xenbus_device *dev)
 {
-       int irq;
-       int ret;
+       int ret = -ENOMEM;
+       struct irq_info *info;
 
        if (evtchn >= xen_evtchn_max_channels())
                return -ENOMEM;
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = get_evtchn_to_irq(evtchn);
+       info = evtchn_to_info(evtchn);
 
-       if (irq == -1) {
-               irq = xen_allocate_irq_dynamic();
-               if (irq < 0)
+       if (!info) {
+               info = xen_allocate_irq_dynamic();
+               if (!info)
                        goto out;
 
-               irq_set_chip_and_handler_name(irq, chip,
+               irq_set_chip_and_handler_name(info->irq, chip,
                                              handle_edge_irq, "event");
 
-               ret = xen_irq_info_evtchn_setup(irq, evtchn, dev);
+               ret = xen_irq_info_evtchn_setup(info, evtchn, dev);
                if (ret < 0) {
-                       __unbind_from_irq(irq);
-                       irq = ret;
+                       __unbind_from_irq(info, info->irq);
                        goto out;
                }
                /*
@@ -1226,17 +1216,17 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
                 * affinity setting is not invoked on them so nothing would
                 * bind the channel.
                 */
-               bind_evtchn_to_cpu(evtchn, 0, false);
-       } else {
-               struct irq_info *info = info_for_irq(irq);
-               if (!WARN_ON(!info || info->type != IRQT_EVTCHN))
-                       info->refcnt++;
+               bind_evtchn_to_cpu(info, 0, false);
+       } else if (!WARN_ON(info->type != IRQT_EVTCHN)) {
+               info->refcnt++;
        }
 
+       ret = info->irq;
+
 out:
        mutex_unlock(&irq_mapping_update_lock);
 
-       return irq;
+       return ret;
 }
 
 int bind_evtchn_to_irq(evtchn_port_t evtchn)
@@ -1255,18 +1245,19 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
        struct evtchn_bind_ipi bind_ipi;
        evtchn_port_t evtchn;
-       int ret, irq;
+       struct irq_info *info;
+       int ret;
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = per_cpu(ipi_to_irq, cpu)[ipi];
+       ret = per_cpu(ipi_to_irq, cpu)[ipi];
 
-       if (irq == -1) {
-               irq = xen_allocate_irq_dynamic();
-               if (irq < 0)
+       if (ret == -1) {
+               info = xen_allocate_irq_dynamic();
+               if (!info)
                        goto out;
 
-               irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+               irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip,
                                              handle_percpu_irq, "ipi");
 
                bind_ipi.vcpu = xen_vcpu_nr(cpu);
@@ -1275,25 +1266,25 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
                        BUG();
                evtchn = bind_ipi.port;
 
-               ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+               ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi);
                if (ret < 0) {
-                       __unbind_from_irq(irq);
-                       irq = ret;
+                       __unbind_from_irq(info, info->irq);
                        goto out;
                }
                /*
                 * Force the affinity mask to the target CPU so proc shows
                 * the correct target.
                 */
-               bind_evtchn_to_cpu(evtchn, cpu, true);
+               bind_evtchn_to_cpu(info, cpu, true);
+               ret = info->irq;
        } else {
-               struct irq_info *info = info_for_irq(irq);
+               info = info_for_irq(ret);
                WARN_ON(info == NULL || info->type != IRQT_IPI);
        }
 
  out:
        mutex_unlock(&irq_mapping_update_lock);
-       return irq;
+       return ret;
 }
 
 static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev,
@@ -1361,22 +1352,23 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
 {
        struct evtchn_bind_virq bind_virq;
        evtchn_port_t evtchn = 0;
-       int irq, ret;
+       struct irq_info *info;
+       int ret;
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = per_cpu(virq_to_irq, cpu)[virq];
+       ret = per_cpu(virq_to_irq, cpu)[virq];
 
-       if (irq == -1) {
-               irq = xen_allocate_irq_dynamic();
-               if (irq < 0)
+       if (ret == -1) {
+               info = xen_allocate_irq_dynamic();
+               if (!info)
                        goto out;
 
                if (percpu)
-                       irq_set_chip_and_handler_name(irq, &xen_percpu_chip,
+                       irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip,
                                                      handle_percpu_irq, "virq");
                else
-                       irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
+                       irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip,
                                                      handle_edge_irq, "virq");
 
                bind_virq.virq = virq;
@@ -1391,10 +1383,9 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
                        BUG_ON(ret < 0);
                }
 
-               ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+               ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq);
                if (ret < 0) {
-                       __unbind_from_irq(irq);
-                       irq = ret;
+                       __unbind_from_irq(info, info->irq);
                        goto out;
                }
 
@@ -1402,22 +1393,26 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
                 * Force the affinity mask for percpu interrupts so proc
                 * shows the correct target.
                 */
-               bind_evtchn_to_cpu(evtchn, cpu, percpu);
+               bind_evtchn_to_cpu(info, cpu, percpu);
+               ret = info->irq;
        } else {
-               struct irq_info *info = info_for_irq(irq);
+               info = info_for_irq(ret);
                WARN_ON(info == NULL || info->type != IRQT_VIRQ);
        }
 
 out:
        mutex_unlock(&irq_mapping_update_lock);
 
-       return irq;
+       return ret;
 }
 
 static void unbind_from_irq(unsigned int irq)
 {
+       struct irq_info *info;
+
        mutex_lock(&irq_mapping_update_lock);
-       __unbind_from_irq(irq);
+       info = info_for_irq(irq);
+       __unbind_from_irq(info, irq);
        mutex_unlock(&irq_mapping_update_lock);
 }
 
@@ -1568,13 +1563,7 @@ EXPORT_SYMBOL_GPL(xen_set_irq_priority);
 
 int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static)
 {
-       int irq = get_evtchn_to_irq(evtchn);
-       struct irq_info *info;
-
-       if (irq == -1)
-               return -ENOENT;
-
-       info = info_for_irq(irq);
+       struct irq_info *info = evtchn_to_info(evtchn);
 
        if (!info)
                return -ENOENT;
@@ -1590,7 +1579,6 @@ EXPORT_SYMBOL_GPL(evtchn_make_refcounted);
 
 int evtchn_get(evtchn_port_t evtchn)
 {
-       int irq;
        struct irq_info *info;
        int err = -ENOENT;
 
@@ -1599,11 +1587,7 @@ int evtchn_get(evtchn_port_t evtchn)
 
        mutex_lock(&irq_mapping_update_lock);
 
-       irq = get_evtchn_to_irq(evtchn);
-       if (irq == -1)
-               goto done;
-
-       info = info_for_irq(irq);
+       info = evtchn_to_info(evtchn);
 
        if (!info)
                goto done;
@@ -1623,16 +1607,17 @@ EXPORT_SYMBOL_GPL(evtchn_get);
 
 void evtchn_put(evtchn_port_t evtchn)
 {
-       int irq = get_evtchn_to_irq(evtchn);
-       if (WARN_ON(irq == -1))
+       struct irq_info *info = evtchn_to_info(evtchn);
+
+       if (WARN_ON(!info))
                return;
-       unbind_from_irq(irq);
+       unbind_from_irq(info->irq);
 }
 EXPORT_SYMBOL_GPL(evtchn_put);
 
 void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
 {
-       int irq;
+       evtchn_port_t evtchn;
 
 #ifdef CONFIG_X86
        if (unlikely(vector == XEN_NMI_VECTOR)) {
@@ -1643,9 +1628,9 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
                return;
        }
 #endif
-       irq = per_cpu(ipi_to_irq, cpu)[vector];
-       BUG_ON(irq < 0);
-       notify_remote_via_irq(irq);
+       evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
+       BUG_ON(evtchn == 0);
+       notify_remote_via_evtchn(evtchn);
 }
 
 struct evtchn_loop_ctrl {
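
The per-cpu ipi_to_evtchn[] cache introduced earlier in this file exists for the function above: sending an IPI no longer needs the irq_info lookup (problematic during CPU unplug now that irq_info is freed via RCU), because notifying the event channel directly is a single hypercall. Roughly what notify_remote_via_evtchn() amounts to (a sketch under that assumption):

struct evtchn_send send = { .port = evtchn };

(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
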
@@ -1656,12 +1641,10 @@ struct evtchn_loop_ctrl {
 
 void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
 {
-       int irq;
-       struct irq_info *info;
+       struct irq_info *info = evtchn_to_info(port);
        struct xenbus_device *dev;
 
-       irq = get_evtchn_to_irq(port);
-       if (irq == -1)
+       if (!info)
                return;
 
        /*
@@ -1686,7 +1669,6 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
                }
        }
 
-       info = info_for_irq(irq);
        if (xchg_acquire(&info->is_active, 1))
                return;
 
@@ -1700,7 +1682,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
                info->eoi_time = get_jiffies_64() + event_eoi_delay;
        }
 
-       generic_handle_irq(irq);
+       generic_handle_irq(info->irq);
 }
 
 int xen_evtchn_do_upcall(void)
@@ -1758,16 +1740,17 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq)
        mutex_lock(&irq_mapping_update_lock);
 
        /* After resume the irq<->evtchn mappings are all cleared out */
-       BUG_ON(get_evtchn_to_irq(evtchn) != -1);
+       BUG_ON(evtchn_to_info(evtchn));
        /* Expect irq to have been bound before,
           so there should be a proper type */
        BUG_ON(info->type == IRQT_UNBOUND);
 
-       (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL);
+       info->irq = irq;
+       (void)xen_irq_info_evtchn_setup(info, evtchn, NULL);
 
        mutex_unlock(&irq_mapping_update_lock);
 
-       bind_evtchn_to_cpu(evtchn, info->cpu, false);
+       bind_evtchn_to_cpu(info, info->cpu, false);
 
        /* Unmask the event channel. */
        enable_irq(irq);
@@ -1801,7 +1784,7 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
         * it, but don't do the xenlinux-level rebind in that case.
         */
        if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
-               bind_evtchn_to_cpu(evtchn, tcpu, false);
+               bind_evtchn_to_cpu(info, tcpu, false);
 
        do_unmask(info, EVT_MASK_REASON_TEMPORARY);
 
@@ -1858,28 +1841,30 @@ static void enable_dynirq(struct irq_data *data)
                do_unmask(info, EVT_MASK_REASON_EXPLICIT);
 }
 
-static void disable_dynirq(struct irq_data *data)
+static void do_ack_dynirq(struct irq_info *info)
 {
-       struct irq_info *info = info_for_irq(data->irq);
-       evtchn_port_t evtchn = info ? info->evtchn : 0;
+       evtchn_port_t evtchn = info->evtchn;
 
        if (VALID_EVTCHN(evtchn))
-               do_mask(info, EVT_MASK_REASON_EXPLICIT);
+               event_handler_exit(info);
 }
 
 static void ack_dynirq(struct irq_data *data)
 {
        struct irq_info *info = info_for_irq(data->irq);
-       evtchn_port_t evtchn = info ? info->evtchn : 0;
 
-       if (VALID_EVTCHN(evtchn))
-               event_handler_exit(info);
+       if (info)
+               do_ack_dynirq(info);
 }
 
 static void mask_ack_dynirq(struct irq_data *data)
 {
-       disable_dynirq(data);
-       ack_dynirq(data);
+       struct irq_info *info = info_for_irq(data->irq);
+
+       if (info) {
+               do_disable_dynirq(info);
+               do_ack_dynirq(info);
+       }
 }
 
 static void lateeoi_ack_dynirq(struct irq_data *data)
@@ -1952,13 +1937,13 @@ static void restore_pirqs(void)
                if (rc) {
                        pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n",
                                gsi, irq, pirq, rc);
-                       xen_free_irq(irq);
+                       xen_free_irq(info);
                        continue;
                }
 
                printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
 
-               __startup_pirq(irq);
+               __startup_pirq(info);
        }
 }
 
@@ -1966,13 +1951,15 @@ static void restore_cpu_virqs(unsigned int cpu)
 {
        struct evtchn_bind_virq bind_virq;
        evtchn_port_t evtchn;
+       struct irq_info *info;
        int virq, irq;
 
        for (virq = 0; virq < NR_VIRQS; virq++) {
                if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1)
                        continue;
+               info = info_for_irq(irq);
 
-               BUG_ON(virq_from_irq(irq) != virq);
+               BUG_ON(virq_from_irq(info) != virq);
 
                /* Get a new binding from Xen. */
                bind_virq.virq = virq;
@@ -1983,9 +1970,9 @@ static void restore_cpu_virqs(unsigned int cpu)
                evtchn = bind_virq.port;
 
                /* Record the new mapping. */
-               (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq);
+               xen_irq_info_virq_setup(info, cpu, evtchn, virq);
                /* The affinity mask is still valid */
-               bind_evtchn_to_cpu(evtchn, cpu, false);
+               bind_evtchn_to_cpu(info, cpu, false);
        }
 }
 
@@ -1993,13 +1980,15 @@ static void restore_cpu_ipis(unsigned int cpu)
 {
        struct evtchn_bind_ipi bind_ipi;
        evtchn_port_t evtchn;
+       struct irq_info *info;
        int ipi, irq;
 
        for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
                if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
                        continue;
+               info = info_for_irq(irq);
 
-               BUG_ON(ipi_from_irq(irq) != ipi);
+               BUG_ON(ipi_from_irq(info) != ipi);
 
                /* Get a new binding from Xen. */
                bind_ipi.vcpu = xen_vcpu_nr(cpu);
@@ -2009,9 +1998,9 @@ static void restore_cpu_ipis(unsigned int cpu)
                evtchn = bind_ipi.port;
 
                /* Record the new mapping. */
-               (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi);
+               xen_irq_info_ipi_setup(info, cpu, evtchn, ipi);
                /* The affinity mask is still valid */
-               bind_evtchn_to_cpu(evtchn, cpu, false);
+               bind_evtchn_to_cpu(info, cpu, false);
        }
 }
 
@@ -2025,13 +2014,6 @@ void xen_clear_irq_pending(int irq)
                event_handler_exit(info);
 }
 EXPORT_SYMBOL(xen_clear_irq_pending);
-void xen_set_irq_pending(int irq)
-{
-       evtchn_port_t evtchn = evtchn_from_irq(irq);
-
-       if (VALID_EVTCHN(evtchn))
-               set_evtchn(evtchn);
-}
 
 bool xen_test_irq_pending(int irq)
 {
index 4d3398e..19ae316 100644 (file)
@@ -33,7 +33,6 @@ struct evtchn_ops {
 
 extern const struct evtchn_ops *evtchn_ops;
 
-int get_evtchn_to_irq(evtchn_port_t evtchn);
 void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
 
 unsigned int cpu_from_evtchn(evtchn_port_t evtchn);
index b3e3d1b..5086552 100644 (file)
@@ -47,6 +47,9 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
+#ifdef CONFIG_ACPI
+#include <acpi/processor.h>
+#endif
 
 /*
  * @cpu_id: Xen physical cpu logical number
@@ -400,4 +403,23 @@ bool __init xen_processor_present(uint32_t acpi_id)
 
        return online;
 }
+
+void xen_sanitize_proc_cap_bits(uint32_t *cap)
+{
+       struct xen_platform_op op = {
+               .cmd                    = XENPF_set_processor_pminfo,
+               .u.set_pminfo.id        = -1,
+               .u.set_pminfo.type      = XEN_PM_PDC,
+       };
+       u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap };
+       int ret;
+
+       set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+       ret = HYPERVISOR_platform_op(&op);
+       if (ret)
+               pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n",
+                      ret);
+       else
+               *cap = buf[2];
+}
 #endif
index b52e0fa..223870a 100644
@@ -21,7 +21,7 @@
 
 #include <xen/xen-front-pgdir-shbuf.h>
 
-/**
+/*
  * This structure represents the structure of a shared page
  * that contains grant references to the pages of the shared
  * buffer. This structure is common to many Xen para-virtualized
@@ -33,7 +33,7 @@ struct xen_page_directory {
        grant_ref_t gref[]; /* Variable length */
 };
 
-/**
+/*
  * Shared buffer ops which are differently implemented
  * depending on the allocation mode, e.g. if the buffer
  * is allocated by the corresponding backend or frontend.
@@ -61,7 +61,7 @@ struct xen_front_pgdir_shbuf_ops {
        int (*unmap)(struct xen_front_pgdir_shbuf *buf);
 };
 
-/**
+/*
  * Get granted reference to the very first page of the
  * page directory. Usually this is passed to the backend,
  * so it can find/fill the grant references to the buffer's
@@ -81,7 +81,7 @@ xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf)
 }
 EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start);
 
-/**
+/*
  * Map granted references of the shared buffer.
  *
  * Depending on the shared buffer mode of allocation
@@ -102,7 +102,7 @@ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf)
 }
 EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map);
 
-/**
+/*
  * Unmap granted references of the shared buffer.
  *
  * Depending on the shared buffer mode of allocation
@@ -123,7 +123,7 @@ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf)
 }
 EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap);
 
-/**
+/*
  * Free all the resources of the shared buffer.
  *
  * \param buf shared buffer whose resources are to be freed.
@@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free);
                                 offsetof(struct xen_page_directory, \
                                          gref)) / sizeof(grant_ref_t))
 
-/**
+/*
  * Get the number of pages the page directory consumes itself.
  *
  * \param buf shared buffer.
@@ -160,7 +160,7 @@ static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf)
        return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE);
 }
 
-/**
+/*
  * Calculate the number of grant references needed to share the buffer
  * and its pages when backend allocates the buffer.
  *
@@ -172,7 +172,7 @@ static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf)
        buf->num_grefs = get_num_pages_dir(buf);
 }
 
-/**
+/*
  * Calculate the number of grant references needed to share the buffer
  * and its pages when frontend allocates the buffer.
  *
@@ -190,7 +190,7 @@ static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf)
 #define xen_page_to_vaddr(page) \
        ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page)))
 
-/**
+/*
  * Unmap the buffer previously mapped with grant references
  * provided by the backend.
  *
@@ -238,7 +238,7 @@ static int backend_unmap(struct xen_front_pgdir_shbuf *buf)
        return ret;
 }
 
-/**
+/*
  * Map the buffer with grant references provided by the backend.
  *
  * \param buf shared buffer.
@@ -320,7 +320,7 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf)
        return ret;
 }
 
-/**
+/*
  * Fill page directory with grant references to the pages of the
  * page directory itself.
  *
@@ -350,7 +350,7 @@ static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf)
        page_dir->gref_dir_next_page = XEN_GREF_LIST_END;
 }
 
-/**
+/*
  * Fill page directory with grant references to the pages of the
  * page directory and the buffer we share with the backend.
  *
@@ -389,7 +389,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf)
        }
 }
 
-/**
+/*
  * Grant references to the frontend's buffer pages.
  *
  * These will be shared with the backend, so it can
@@ -418,7 +418,7 @@ static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf,
        return 0;
 }
 
-/**
+/*
  * Grant all the references needed to share the buffer.
  *
  * Grant references to the page directory pages and, if
@@ -466,7 +466,7 @@ static int grant_references(struct xen_front_pgdir_shbuf *buf)
        return 0;
 }
 
-/**
+/*
  * Allocate all required structures to manage shared buffer.
  *
  * \param buf shared buffer.
@@ -506,7 +506,7 @@ static const struct xen_front_pgdir_shbuf_ops local_ops = {
        .grant_refs_for_buffer = guest_grant_refs_for_buffer,
 };
 
-/**
+/*
  * Allocate a new instance of a shared buffer.
  *
  * \param cfg configuration to be used while allocating a new shared buffer.
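
The /** opener is reserved for kernel-doc, and scripts/kernel-doc warns when such a comment does not follow the expected "name() - description" plus @param layout; demoting these free-form comments to plain /* quiets W=1 builds without losing any text. A minimal sketch of the distinction (the function and struct names here are illustrative, not from this file):

/**
 * example_map() - Map granted references of a shared buffer.
 * @buf: shared buffer to map.
 *
 * Return: zero on success, negative errno otherwise.
 */
int example_map(struct example_buf *buf);

/*
 * Free-form commentary, like the blocks changed above, should use the
 * plain comment opener so kernel-doc does not try to parse it.
 */
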
index ef02c9b..23c0834 100644 (file)
@@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
                                  bp.level - 1,
                                  0);
        b = bch2_btree_iter_peek_node(iter);
-       if (IS_ERR(b))
+       if (IS_ERR_OR_NULL(b))
                goto err;
 
        BUG_ON(b->c.level != bp.level - 1);
 
-       if (b && extent_matches_bp(c, bp.btree_id, bp.level,
-                                  bkey_i_to_s_c(&b->key),
-                                  bucket, bp))
+       if (extent_matches_bp(c, bp.btree_id, bp.level,
+                             bkey_i_to_s_c(&b->key),
+                             bucket, bp))
                return b;
 
-       if (b && btree_node_will_make_reachable(b)) {
+       if (btree_node_will_make_reachable(b)) {
                b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
        } else {
                backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
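
bch2_btree_iter_peek_node() can return NULL as well as an ERR_PTR()-encoded error, so testing only IS_ERR() let a NULL slip through to the dereferences below; with IS_ERR_OR_NULL() checked up front, the later `b && ...` guards become redundant and are dropped. A self-contained userspace mimic of the idiom, assuming nothing beyond libc (the kernel's real helpers live in include/linux/err.h):

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err) { return (void *)err; }
static inline int is_err(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static inline int is_err_or_null(const void *p)
{
	return !p || is_err(p);
}

int main(void)
{
	void *vals[3] = { err_ptr(-5), NULL, vals };

	for (int i = 0; i < 3; i++)
		printf("vals[%d]: is_err=%d is_err_or_null=%d\n",
		       i, is_err(vals[i]), is_err_or_null(vals[i]));
	return 0;
}
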
index 9cb8684..403aa33 100644 (file)
@@ -617,7 +617,7 @@ struct journal_seq_blacklist_table {
                u64             start;
                u64             end;
                bool            dirty;
-       }                       entries[0];
+       }                       entries[];
 };
 
 struct journal_keys {
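
`entries[0]` is the old GCC zero-length-array idiom; the C99 flexible array member `entries[]` describes the same layout while letting -fstrict-flex-arrays and the fortified memory helpers catch out-of-bounds accesses. A minimal userspace sketch of sizing and allocating such a struct (calloc stands in for the kernel's kzalloc()/struct_size()):

#include <stdlib.h>
#include <stdint.h>

struct blacklist_table_sketch {
	size_t nr;
	struct {
		uint64_t start, end;
	} entries[];			/* flexible array member */
};

static struct blacklist_table_sketch *table_alloc(size_t nr)
{
	struct blacklist_table_sketch *t =
		calloc(1, sizeof(*t) + nr * sizeof(t->entries[0]));

	if (t)
		t->nr = nr;
	return t;
}
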
index c2adf3f..6fa90bc 100644 (file)
@@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
                srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
        }
 
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
        kfree(trans->extra_journal_entries.data);
 
        if (trans->fs_usage_deltas) {
index 9b78f78..37fbf22 100644 (file)
@@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-       if (ck->c.lock.readers)
+       if (ck->c.lock.readers) {
                list_move_tail(&ck->list, &bc->freed_pcpu);
-       else
+               bc->nr_freed_pcpu++;
+       } else {
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
+       }
        atomic_long_inc(&bc->nr_freed);
 
        kfree(ck->k);
@@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 {
        struct bkey_cached *pos;
 
+       bc->nr_freed_nonpcpu++;
+
        list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
                if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
                                 pos->btree_trans_barrier_seq)) {
@@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 #else
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
                mutex_unlock(&bc->lock);
 #endif
        } else {
@@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                               f->nr < ARRAY_SIZE(f->objs) / 2) {
                                ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                                list_del_init(&ck->list);
+                               bc->nr_freed_nonpcpu--;
                                f->objs[f->nr++] = ck;
                        }
 
@@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                if (!list_empty(&bc->freed_nonpcpu)) {
                        ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                        list_del_init(&ck->list);
+                       bc->nr_freed_nonpcpu--;
                }
                mutex_unlock(&bc->lock);
 #endif
@@ -664,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                goto out;
 
        bch2_journal_pin_drop(j, &ck->journal);
-       bch2_journal_preres_put(j, &ck->res);
 
        BUG_ON(!btree_node_locked(c_iter.path, 0));
 
@@ -762,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
        BUG_ON(insert->k.u64s > ck->u64s);
 
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               int difference;
-
-               BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
-               difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
-               if (difference > 0) {
-                       trans->journal_preres.u64s      -= difference;
-                       ck->res.u64s                    += difference;
-               }
-       }
-
        bkey_copy(ck->k, insert);
        ck->valid = true;
 
@@ -850,6 +845,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
+       scanned += bc->nr_freed_nonpcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -859,13 +856,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_nonpcpu--;
        }
 
        if (scanned >= nr)
                goto out;
 
+       scanned += bc->nr_freed_pcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -875,8 +874,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_pcpu--;
        }
 
        if (scanned >= nr)
@@ -982,6 +981,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        }
 #endif
 
+       BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+       BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
        list_splice(&bc->freed_pcpu,    &items);
        list_splice(&bc->freed_nonpcpu, &items);
 
@@ -991,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                cond_resched();
 
                bch2_journal_pin_drop(&c->journal, &ck->journal);
-               bch2_journal_preres_put(&c->journal, &ck->res);
 
                list_del(&ck->list);
                kfree(ck->k);
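
The new nr_freed_pcpu/nr_freed_nonpcpu fields are adjusted at every list move above so the shrinker can charge a whole freelist in O(1) ("scanned += bc->nr_freed_nonpcpu") instead of counting the entries it skips, and the BUG_ON()s added to bch2_fs_btree_key_cache_exit() cross-check the counters against list_count_nodes(). The invariant being maintained, sketched with a hypothetical counted list in plain C:

#include <assert.h>
#include <stddef.h>

struct node { struct node *next; };

struct counted_list {
	struct node *head;
	size_t nr;		/* must always equal the list's length */
};

static void counted_push(struct counted_list *l, struct node *n)
{
	n->next = l->head;
	l->head = n;
	l->nr++;		/* every mutation adjusts the counter */
}

static struct node *counted_pop(struct counted_list *l)
{
	struct node *n = l->head;

	if (n) {
		l->head = n->next;
		l->nr--;
	}
	return n;
}

/* Mirrors the new BUG_ON()s: recount and compare on teardown. */
static void counted_check(const struct counted_list *l)
{
	size_t n = 0;

	for (const struct node *p = l->head; p; p = p->next)
		n++;
	assert(n == l->nr);
}
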
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
new file mode 100644 (file)
index 0000000..290e4e5
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+       struct bkey_cached      *objs[16];
+       unsigned                nr;
+};
+
+struct btree_key_cache {
+       struct mutex            lock;
+       struct rhashtable       table;
+       bool                    table_init_done;
+
+       struct list_head        freed_pcpu;
+       size_t                  nr_freed_pcpu;
+       struct list_head        freed_nonpcpu;
+       size_t                  nr_freed_nonpcpu;
+
+       struct shrinker         *shrink;
+       unsigned                shrink_iter;
+       struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+       atomic_long_t           nr_freed;
+       atomic_long_t           nr_keys;
+       atomic_long_t           nr_dirty;
+};
+
+struct bkey_cached_key {
+       u32                     btree_id;
+       struct bpos             pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
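
This new header follows the tree's *_types.h convention: it carries only type definitions, no function declarations, so btree_types.h (changed below) can include it without dragging in, or creating a cycle with, the header that declares the key cache operations. The general shape, with hypothetical names:

/* foo_types.h - types only, safe to include from other _types.h files */
#ifndef FOO_TYPES_H
#define FOO_TYPES_H

struct foo {
	int x;
};

#endif

/* foo.h - operations; includes the types header, never the reverse */
#ifndef FOO_H
#define FOO_H

#include "foo_types.h"

int foo_init(struct foo *);

#endif
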
index decad7b..12907be 100644 (file)
@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
                bch2_btree_init_next(trans, b);
 }
 
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       EBUG_ON(trans->write_locked);
+
+       trans_for_each_update(trans, i) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+                       return trans_lock_write_fail(trans, i);
+
+               if (!i->cached)
+                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trans->write_locked = true;
+       return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+       if (likely(trans->write_locked)) {
+               struct btree_insert_entry *i;
+
+               trans_for_each_update(trans, i)
+                       if (!same_leaf_as_prev(trans, i))
+                               bch2_btree_node_unlock_write_inlined(trans, i->path,
+                                                                    insert_l(i)->b);
+               trans->write_locked = false;
+       }
+}
+
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
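
bch2_trans_lock_write() now records success in the new trans->write_locked bit and bch2_trans_unlock_write() keys off it, so the unlock can be issued unconditionally on every exit path with no risk of a double unlock. A minimal userspace sketch of the tracked-lock-state pattern (names are illustrative):

#include <pthread.h>
#include <stdbool.h>

struct trans_sketch {
	pthread_mutex_t lock;
	bool write_locked;
};

static int trans_lock_write(struct trans_sketch *t)
{
	int ret = pthread_mutex_lock(&t->lock);

	if (!ret)
		t->write_locked = true;
	return ret;
}

/* Safe to call on any exit path, whether or not the lock was taken: */
static void trans_unlock_write(struct trans_sketch *t)
{
	if (t->write_locked) {
		t->write_locked = false;
		pthread_mutex_unlock(&t->lock);
	}
}
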
@@ -276,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
                bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
 }
 
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
-                                  unsigned long trace_ip)
-{
-       return drop_locks_do(trans,
-               bch2_journal_preres_get(&trans->c->journal,
-                       &trans->journal_preres,
-                       trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)));
-}
-
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
                                                      unsigned flags)
 {
@@ -321,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
        return 0;
 }
 
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+                                    struct btree_path *path, unsigned new_u64s)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct bkey_cached *ck = (void *) path->l[0].b;
+       struct bkey_i *new_k;
+       int ret;
+
+       bch2_trans_unlock_write(trans);
+       bch2_trans_unlock(trans);
+
+       new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+       if (!new_k) {
+               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+                       bch2_btree_id_str(path->btree_id), new_u64s);
+               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+       }
+
+       ret =   bch2_trans_relock(trans) ?:
+               bch2_trans_lock_write(trans);
+       if (unlikely(ret)) {
+               kfree(new_k);
+               return ret;
+       }
+
+       memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+       trans_for_each_update(trans, i)
+               if (i->old_v == &ck->k->v)
+                       i->old_v = &new_k->v;
+
+       kfree(ck->k);
+       ck->u64s        = new_u64s;
+       ck->k           = new_k;
+       return 0;
+}
+
 static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
                                       struct btree_path *path, unsigned u64s)
 {
@@ -347,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
                return 0;
 
        new_u64s        = roundup_pow_of_two(u64s);
-       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
-       if (!new_k) {
-               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
-                       bch2_btree_id_str(path->btree_id), new_u64s);
-               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
-       }
+       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+       if (unlikely(!new_k))
+               return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
 
        trans_for_each_update(trans, i)
                if (i->old_v == &ck->k->v)
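
The krealloc() here now runs with btree node write locks held, so it must not sleep or recurse into reclaim; hence GFP_NOWAIT, with btree_key_can_insert_cached_slowpath() as the fallback that drops every lock, allocates with GFP_KERNEL, relocks (which may restart the transaction), and splices the new buffer in. A condensed kernel-style sketch of that shape, not the verbatim bcachefs code (unlock_all()/relock_all() stand in for bch2_trans_unlock_write()/bch2_trans_relock()):

	/* Fast path: atomic allocation while locks are held. */
	new_k = krealloc(ck->k, new_bytes, GFP_NOWAIT);
	if (unlikely(!new_k)) {
		/* Slow path: drop locks, then do a sleeping allocation. */
		unlock_all(trans);

		new_k = kmalloc(new_bytes, GFP_KERNEL);
		if (!new_k)
			return -ENOMEM;

		ret = relock_all(trans);	/* may restart the trans */
		if (ret) {
			kfree(new_k);
			return ret;
		}

		memcpy(new_k, ck->k, old_bytes);
		kfree(ck->k);
	}
	ck->k = new_k;
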
@@ -732,37 +804,6 @@ revert_fs_usage:
        return ret;
 }
 
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       while (--i >= trans->updates) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
-       }
-
-       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
-                       return trans_lock_write_fail(trans, i);
-
-               if (!i->cached)
-                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
-       }
-
-       return 0;
-}
-
 static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
 {
        struct btree_insert_entry *i;
@@ -830,15 +871,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
                }
        }
 
-       ret = bch2_journal_preres_get(&c->journal,
-                       &trans->journal_preres, trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
-       if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
-               ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
-       if (unlikely(ret))
-               return ret;
-
-       ret = trans_lock_write(trans);
+       ret = bch2_trans_lock_write(trans);
        if (unlikely(ret))
                return ret;
 
@@ -847,10 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
        if (!ret && unlikely(trans->journal_replay_not_finished))
                bch2_drop_overwrites_from_journal(trans);
 
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(trans, i->path,
-                                                       insert_l(i)->b);
+       bch2_trans_unlock_write(trans);
 
        if (!ret && trans->journal_pin)
                bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -1003,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i = NULL;
        struct btree_write_buffered_key *wb;
-       unsigned u64s;
        int ret = 0;
 
        if (!trans->nr_updates &&
@@ -1063,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 
        EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
-       memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
        trans->journal_u64s             = trans->extra_journal_entries.nr;
-       trans->journal_preres_u64s      = 0;
-
        trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
        if (trans->journal_transaction_names)
                trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
 
@@ -1085,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                if (i->key_cache_already_flushed)
                        continue;
 
-               /* we're going to journal the key being updated: */
-               u64s = jset_u64s(i->k->k.u64s);
-               if (i->cached &&
-                   likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
-                       trans->journal_preres_u64s += u64s;
-
                if (i->flags & BTREE_UPDATE_NOJOURNAL)
                        continue;
 
-               trans->journal_u64s += u64s;
+               /* we're going to journal the key being updated: */
+               trans->journal_u64s += jset_u64s(i->k->k.u64s);
 
                /* and we're also going to log the overwrite: */
                if (trans->journal_transaction_names)
@@ -1126,8 +1145,6 @@ retry:
 
        trace_and_count(c, transaction_commit, trans, _RET_IP_);
 out:
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
        if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
                bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
index 941841a..60453ba 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
 #include "errcode.h"
@@ -312,31 +312,6 @@ struct btree_iter {
 #endif
 };
 
-struct btree_key_cache_freelist {
-       struct bkey_cached      *objs[16];
-       unsigned                nr;
-};
-
-struct btree_key_cache {
-       struct mutex            lock;
-       struct rhashtable       table;
-       bool                    table_init_done;
-       struct list_head        freed_pcpu;
-       struct list_head        freed_nonpcpu;
-       struct shrinker         *shrink;
-       unsigned                shrink_iter;
-       struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-       atomic_long_t           nr_freed;
-       atomic_long_t           nr_keys;
-       atomic_long_t           nr_dirty;
-};
-
-struct bkey_cached_key {
-       u32                     btree_id;
-       struct bpos             pos;
-} __packed __aligned(4);
-
 #define BKEY_CACHED_ACCESSED           0
 #define BKEY_CACHED_DIRTY              1
 
@@ -352,7 +327,6 @@ struct bkey_cached {
        struct rhash_head       hash;
        struct list_head        list;
 
-       struct journal_preres   res;
        struct journal_entry_pin journal;
        u64                     seq;
 
@@ -389,11 +363,7 @@ struct btree_insert_entry {
        unsigned long           ip_allocated;
 };
 
-#ifndef CONFIG_LOCKDEP
 #define BTREE_ITER_MAX         64
-#else
-#define BTREE_ITER_MAX         32
-#endif
 
 struct btree_trans_commit_hook;
 typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -434,6 +404,7 @@ struct btree_trans {
        bool                    journal_transaction_names:1;
        bool                    journal_replay_not_finished:1;
        bool                    notrace_relock_fail:1;
+       bool                    write_locked:1;
        enum bch_errcode        restarted:16;
        u32                     restart_count;
        unsigned long           last_begin_ip;
@@ -465,11 +436,9 @@ struct btree_trans {
        struct journal_entry_pin *journal_pin;
 
        struct journal_res      journal_res;
-       struct journal_preres   journal_preres;
        u64                     *journal_seq;
        struct disk_reservation *disk_res;
        unsigned                journal_u64s;
-       unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
 };
 
index 39c2db6..76f27bc 100644 (file)
@@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
                up_read(&c->gc_lock);
        as->took_gc_lock = false;
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
        bch2_disk_reservation_put(c, &as->disk_res);
@@ -734,8 +732,6 @@ err:
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        mutex_lock(&c->btree_interior_update_lock);
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
@@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        unsigned nr_nodes[2] = { 0, 0 };
        unsigned update_level = level;
        enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       unsigned journal_flags = 0;
        int ret = 0;
        u32 restart_count = trans->restart_count;
 
@@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        flags &= ~BCH_WATERMARK_MASK;
        flags |= watermark;
 
-       if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
-               journal_flags |= JOURNAL_RES_GET_NONBLOCK;
-       journal_flags |= watermark;
-
        while (1) {
                nr_nodes[!!update_level] += 1 + split;
                update_level++;
@@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        if (ret)
                goto err;
 
-       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                     BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
-       if (ret) {
-               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
-                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
-                       goto err;
-               }
-
-               ret = drop_locks_do(trans,
-                       bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                             BTREE_UPDATE_JOURNAL_RES,
-                                             journal_flags));
-               if (ret == -BCH_ERR_journal_preres_get_blocked) {
-                       trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
-                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
-               }
-               if (ret)
-                       goto err;
-       }
-
        ret = bch2_disk_reservation_get(c, &as->disk_res,
                        (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
                        c->opts.metadata_replicas,
index 4df2151..031076e 100644 (file)
@@ -55,7 +55,6 @@ struct btree_update {
        unsigned                        update_level;
 
        struct disk_reservation         disk_res;
-       struct journal_preres           journal_preres;
 
        /*
         * BTREE_INTERIOR_UPDATING_NODE:
index 0771a6d..5ed6620 100644 (file)
@@ -239,6 +239,34 @@ restart_drop_extra_replicas:
 
                next_pos = insert->k.p;
 
+               /*
+                * Check for nonce offset inconsistency:
+                * This is debug code - we've been seeing this bug rarely, and
+                * it's been hard to reproduce, so this should give us some more
+                * information when it does occur:
+                */
+               struct printbuf err = PRINTBUF;
+               int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+               printbuf_exit(&err);
+
+               if (invalid) {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "about to insert invalid key in data update path");
+                       prt_str(&buf, "\nold: ");
+                       bch2_bkey_val_to_text(&buf, c, old);
+                       prt_str(&buf, "\nk:   ");
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       prt_str(&buf, "\nnew: ");
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+                       bch2_print_string_as_lines(KERN_ERR, buf.buf);
+                       printbuf_exit(&buf);
+
+                       bch2_fatal_error(c);
+                       goto out;
+               }
+
                ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
                                                k.k->p, bkey_start_pos(&insert->k)) ?:
                        bch2_insert_snapshot_whiteouts(trans, m->btree_id,
index d613695..4d0cb0c 100644 (file)
@@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
        case TARGET_DEV: {
                struct bch_dev *ca;
 
+               out->atomic++;
                rcu_read_lock();
                ca = t.dev < c->sb.nr_devices
                        ? rcu_dereference(c->devs[t.dev])
@@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
                }
 
                rcu_read_unlock();
+               out->atomic--;
                break;
        }
        case TARGET_GROUP:
@@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
        }
 }
 
-void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
 {
        struct target t = target_decode(v);
 
index 875f7c5..2a77de1 100644 (file)
@@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                        h->nr_active_devs++;
 
        rcu_read_unlock();
+
+       /*
+        * If we only have redundancy + 1 devices, we're better off with just
+        * replication:
+        */
+       if (h->nr_active_devs < h->redundancy + 2)
+               bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+                       h->nr_active_devs, h->redundancy + 2);
+
        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
 }
@@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
 
        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
 found:
+       if (!IS_ERR_OR_NULL(h) &&
+           h->nr_active_devs < h->redundancy + 2) {
+               mutex_unlock(&h->lock);
+               h = NULL;
+       }
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
 }
@@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
        int ret;
 
        h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
-       if (!h)
-               bch_err(c, "no stripe head");
        if (IS_ERR_OR_NULL(h))
                return h;
 
index 8bd9bcd..ff664fd 100644 (file)
@@ -13,7 +13,7 @@
 
 int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                                     loff_t start, u64 end,
-                                    int fgp_flags, gfp_t gfp,
+                                    fgf_t fgp_flags, gfp_t gfp,
                                     folios *fs)
 {
        struct folio *f;
index a2222ad..27f712a 100644 (file)
@@ -7,7 +7,7 @@
 typedef DARRAY(struct folio *) folios;
 
 int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
-                                    u64, int, gfp_t, folios *);
+                                    u64, fgf_t, gfp_t, folios *);
 int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /*
index 166d8d8..8ef8173 100644 (file)
@@ -1922,10 +1922,7 @@ out:
        return dget(sb->s_root);
 
 err_put_super:
-       sb->s_fs_info = NULL;
-       c->vfs_sb = NULL;
        deactivate_locked_super(sb);
-       bch2_fs_stop(c);
        return ERR_PTR(bch2_err_class(ret));
 }
 
@@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
 {
        struct bch_fs *c = sb->s_fs_info;
 
-       if (c)
-               c->vfs_sb = NULL;
        generic_shutdown_super(sb);
-       if (c)
-               bch2_fs_free(c);
+       bch2_fs_free(c);
 }
 
 static struct file_system_type bcache_fs_type = {
index 9f3e9bd..e0c5cd1 100644 (file)
@@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r)
        const struct nlink *l = _l;
        const struct nlink *r = _r;
 
-       return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+       return cmp_int(l->inum, r->inum);
 }
 
 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
index def77f2..c7849b0 100644 (file)
@@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
                 * unlinked inodes in the snapshot leaves:
                 */
                *need_another_pass = true;
-               return 0;
+               goto out;
        }
 
        ret = 1;
@@ -1169,8 +1169,10 @@ again:
         */
        for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p,
-                                                                    &need_another_pass));
+               ret = commit_do(trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_LAZY_RW,
+                       may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
                if (ret < 0)
                        break;
 
index f02b3f7..d704a8f 100644 (file)
@@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
         * checksum:
         */
        csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
-       if (bch2_crc_cmp(op->crc.csum, csum))
+       if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
                return -EIO;
 
        ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
index 5b5d69f..23a9b78 100644 (file)
@@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
        return ret;
 }
 
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
-                                    struct journal_preres *res,
-                                    unsigned new_u64s,
-                                    unsigned flags)
-{
-       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
-       if (!ret && mutex_trylock(&j->reclaim_lock)) {
-               bch2_journal_reclaim(j);
-               mutex_unlock(&j->reclaim_lock);
-       }
-
-       return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
-                             struct journal_preres *res,
-                             unsigned new_u64s,
-                             unsigned flags)
-{
-       int ret;
-
-       closure_wait_event(&j->preres_wait,
-                  (ret = bch2_journal_error(j)) ||
-                  journal_preres_available(j, res, new_u64s, flags));
-       return ret;
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *j,
@@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        prt_printf(out, "last_seq:\t\t%llu\n",          journal_last_seq(j));
        prt_printf(out, "last_seq_ondisk:\t%llu\n",             j->last_seq_ondisk);
        prt_printf(out, "flushed_seq_ondisk:\t%llu\n",  j->flushed_seq_ondisk);
-       prt_printf(out, "prereserved:\t\t%u/%u\n",              j->prereserved.reserved, j->prereserved.remaining);
        prt_printf(out, "watermark:\t\t%s\n",           bch2_watermarks[j->watermark]);
        prt_printf(out, "each entry reserved:\t%u\n",   j->entry_u64s_reserved);
        prt_printf(out, "nr flush writes:\t%llu\n",             j->nr_flush_writes);
index 011711e..c85d01c 100644 (file)
@@ -395,104 +395,6 @@ out:
        return 0;
 }
 
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
-       union journal_preres_state s = READ_ONCE(j->prereserved);
-       unsigned watermark = BCH_WATERMARK_stripe;
-
-       if (fifo_free(&j->pin) < j->pin.size / 4)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (fifo_free(&j->pin) < j->pin.size / 8)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (s.reserved > s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (!s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (watermark == j->watermark)
-               return;
-
-       swap(watermark, j->watermark);
-       if (watermark > j->watermark)
-               journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
-                                          struct journal_preres *res)
-{
-       union journal_preres_state s = { .reserved = res->u64s };
-
-       if (!res->u64s)
-               return;
-
-       s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
-       res->u64s = 0;
-
-       if (unlikely(s.waiting)) {
-               clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
-                         (unsigned long *) &j->prereserved.v);
-               closure_wake_up(&j->preres_wait);
-       }
-
-       if (s.reserved <= s.remaining && j->watermark)
-               journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
-                       struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
-                                              struct journal_preres *res,
-                                              unsigned new_u64s,
-                                              unsigned flags,
-                                              bool set_waiting)
-{
-       int d = new_u64s - res->u64s;
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       int ret;
-
-       do {
-               old.v = new.v = v;
-               ret = 0;
-
-               if (watermark == BCH_WATERMARK_reclaim ||
-                   new.reserved + d < new.remaining) {
-                       new.reserved += d;
-                       ret = 1;
-               } else if (set_waiting && !new.waiting)
-                       new.waiting = true;
-               else
-                       return 0;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
-
-       if (ret)
-               res->u64s += d;
-       return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
-                                         struct journal_preres *res,
-                                         unsigned new_u64s,
-                                         unsigned flags)
-{
-       if (new_u64s <= res->u64s)
-               return 0;
-
-       if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
-               return 0;
-
-       if (flags & JOURNAL_RES_GET_NONBLOCK)
-               return -BCH_ERR_journal_preres_get_blocked;
-
-       return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *,
index f4bc2cd..786a092 100644 (file)
@@ -1079,6 +1079,12 @@ found:
 
        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+               /*
+                * Debug code for ZNS support, where we probably want to
+                * correlate where we stopped in the journal with the zone
+                * write points:
+                */
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
@@ -1086,6 +1092,7 @@ found:
 
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
+#endif
                ja->sectors_free = 0;
        }
 
index 9a584aa..e63c6ed 100644 (file)
@@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
        return available;
 }
 
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j, bool low_on_space)
 {
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
+       unsigned watermark = BCH_WATERMARK_stripe;
 
-       do {
-               old.v = new.v = v;
-               new.remaining = u64s_remaining;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
+       if (low_on_space)
+               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+       if (fifo_free(&j->pin) < j->pin.size / 4)
+               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+       if (watermark == j->watermark)
+               return;
+
+       swap(watermark, j->watermark);
+       if (watermark > j->watermark)
+               journal_wake(j);
 }
 
 static struct journal_space
@@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned clean, clean_ondisk, total;
-       s64 u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j)
        else
                clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
 
-       u64s_remaining  = (u64) clean << 6;
-       u64s_remaining -= (u64) total << 3;
-       u64s_remaining = max(0LL, u64s_remaining);
-       u64s_remaining /= 4;
-       u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+       journal_set_watermark(j, clean * 4 <= total);
 out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
-       journal_set_remaining(j, u64s_remaining);
-       journal_set_watermark(j);
 
        if (!ret)
                journal_wake(j);
@@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j)
                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;
 
-               /* And include pre-reservations: */
-               nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
-                                          (ca->mi.bucket_size << 6) -
-                                          journal_entry_overhead(j));
-
                nr_buckets = min(nr_buckets, ja->nr);
 
                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                               msecs_to_jiffies(c->opts.journal_reclaim_delay)))
                        min_nr = 1;
 
-               if (j->prereserved.reserved * 4 > j->prereserved.remaining)
-                       min_nr = 1;
-
-               if (fifo_free(&j->pin) <= 32)
+               if (j->watermark != BCH_WATERMARK_stripe)
                        min_nr = 1;
 
                if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                trace_and_count(c, journal_reclaim_start, c,
                                direct, kicked,
                                min_nr, min_key_cache,
-                               j->prereserved.reserved,
-                               j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
                                c->btree_cache.used,
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
index 42504e1..a756b69 100644 (file)
@@ -76,14 +76,6 @@ struct journal_res {
        u64                     seq;
 };
 
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
-       unsigned                u64s;
-};
-
 union journal_res_state {
        struct {
                atomic64_t      counter;
@@ -104,22 +96,6 @@ union journal_res_state {
        };
 };
 
-union journal_preres_state {
-       struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
-       struct {
-               u64             waiting:1,
-                               reserved:31,
-                               remaining:32;
-       };
-};
-
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
 #define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
        union journal_res_state reservations;
        enum bch_watermark      watermark;
 
-       union journal_preres_state prereserved;
-
        } __aligned(SMP_CACHE_BYTES);
 
        unsigned long           flags;
index b775cf0..9779044 100644 (file)
@@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
                this_cpu_sub(*lock->readers, !ret);
                preempt_enable();
 
-               if (!ret && (old & SIX_LOCK_WAITING_write))
-                       ret = -1 - SIX_LOCK_write;
+               if (!ret) {
+                       smp_mb();
+                       if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+                               ret = -1 - SIX_LOCK_write;
+               }
        } else if (type == SIX_LOCK_write && lock->readers) {
                if (try) {
                        atomic_add(SIX_LOCK_HELD_write, &lock->state);
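
The reader fast path increments a per-cpu reader count and must then observe any concurrent writer's SIX_LOCK_WAITING_write bit; without a full barrier the load can be satisfied before the increment becomes visible, and both sides can miss each other - the classic store-buffering hazard. A self-contained C11 illustration of why the fence matters (the flag names are illustrative, not the six-lock fields):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool reader_active, writer_waiting;

static bool reader_take(void)
{
	atomic_store_explicit(&reader_active, true, memory_order_relaxed);
	/* Full barrier: publish our store before checking theirs. */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&writer_waiting, memory_order_relaxed)) {
		atomic_store_explicit(&reader_active, false,
				      memory_order_relaxed);
		return false;	/* back off in favour of the writer */
	}
	return true;
}

static bool writer_may_proceed(void)
{
	atomic_store_explicit(&writer_waiting, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	/* With both fences, at least one side must see the other. */
	return !atomic_load_explicit(&reader_active, memory_order_relaxed);
}
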
index 8683344..2d2e66a 100644 (file)
@@ -20,7 +20,7 @@ struct snapshot_t {
 };
 
 struct snapshot_table {
-       struct snapshot_t       s[0];
+       DECLARE_FLEX_ARRAY(struct snapshot_t, s);
 };
 
 typedef struct {
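
ISO C does not allow a flexible array member to be the only member of a struct, so `s[0]` cannot simply become `s[]` here; DECLARE_FLEX_ARRAY() from <linux/stddef.h> keeps the struct legal by pairing the array with an empty placeholder member. Roughly (simplified from include/uapi/linux/stddef.h), the result expands along these lines:

struct snapshot_table {
	struct {
		struct { } __empty_s;	/* zero-sized placeholder member */
		struct snapshot_t s[];	/* the flexible array itself */
	};
};
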
index 893304a..7857671 100644 (file)
@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
 TRACE_EVENT(journal_reclaim_start,
        TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
                 u64 min_nr, u64 min_key_cache,
-                u64 prereserved, u64 prereserved_total,
                 u64 btree_cache_dirty, u64 btree_cache_total,
                 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+       TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
                btree_cache_dirty, btree_cache_total,
                btree_key_cache_dirty, btree_key_cache_total),
 
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
                __field(bool,           kicked                  )
                __field(u64,            min_nr                  )
                __field(u64,            min_key_cache           )
-               __field(u64,            prereserved             )
-               __field(u64,            prereserved_total       )
                __field(u64,            btree_cache_dirty       )
                __field(u64,            btree_cache_total       )
                __field(u64,            btree_key_cache_dirty   )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->kicked                 = kicked;
                __entry->min_nr                 = min_nr;
                __entry->min_key_cache          = min_key_cache;
-               __entry->prereserved            = prereserved;
-               __entry->prereserved_total      = prereserved_total;
                __entry->btree_cache_dirty      = btree_cache_dirty;
                __entry->btree_cache_total      = btree_cache_total;
                __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->direct,
                  __entry->kicked,
                  __entry->min_nr,
                  __entry->min_key_cache,
-                 __entry->prereserved,
-                 __entry->prereserved_total,
                  __entry->btree_cache_dirty,
                  __entry->btree_cache_total,
                  __entry->btree_key_cache_dirty,
index a39ff0c..79d9826 100644 (file)
@@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                s.v = v + 1;
                s.defined = true;
        } else {
+               /*
+                * Check if this option was set on the parent - if so, switch
+                * back to inheriting from the parent:
+                *
+                * rename() also has to deal with keeping inherited options up
+                * to date - see bch2_reinherit_attrs()
+                */
+               spin_lock(&dentry->d_lock);
                if (!IS_ROOT(dentry)) {
                        struct bch_inode_info *dir =
                                to_bch_ei(d_inode(dentry->d_parent));
@@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                } else {
                        s.v = 0;
                }
+               spin_unlock(&dentry->d_lock);
 
                s.defined = false;
        }
index 2a9344a..35c1d24 100644 (file)
@@ -432,7 +432,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
        if (btrfs_block_can_be_shared(trans, root, buf)) {
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
-                                              &refs, &flags);
+                                              &refs, &flags, NULL);
                if (ret)
                        return ret;
                if (unlikely(refs == 0)) {
index 9223934..891ea2f 100644 (file)
@@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+       if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
                record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+       if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
                record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
                        kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
index c8e5b47..0455935 100644 (file)
@@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
  */
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 offset, int metadata, u64 *refs, u64 *flags)
+                            u64 offset, int metadata, u64 *refs, u64 *flags,
+                            u64 *owning_root)
 {
        struct btrfs_root *extent_root;
        struct btrfs_delayed_ref_head *head;
@@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
+       u64 owner = 0;
        int ret;
 
        /*
@@ -167,6 +169,8 @@ search_again:
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
+                       owner = btrfs_get_extent_owner_root(fs_info, leaf,
+                                                           path->slots[0]);
                } else {
                        ret = -EUCLEAN;
                        btrfs_err(fs_info,
@@ -226,6 +230,8 @@ out:
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
+       if (owning_root)
+               *owning_root = owner;
 out_free:
        btrfs_free_path(path);
        return ret;
@@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                /* We don't lock the tree block, it's OK to be racy here */
                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
                                               wc->level - 1, 1, &refs,
-                                              &flags);
+                                              &flags, NULL);
                /* We don't care about errors in readahead. */
                if (ret < 0)
                        continue;
@@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                ret = btrfs_lookup_extent_info(trans, fs_info,
                                               eb->start, level, 1,
                                               &wc->refs[level],
-                                              &wc->flags[level]);
+                                              &wc->flags[level],
+                                              NULL);
                BUG_ON(ret == -ENOMEM);
                if (ret)
                        return ret;
@@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 bytenr;
        u64 generation;
        u64 parent;
+       u64 owner_root = 0;
        struct btrfs_tree_parent_check check = { 0 };
        struct btrfs_key key;
        struct btrfs_ref ref = { 0 };
@@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
-                                      &wc->flags[level - 1]);
+                                      &wc->flags[level - 1],
+                                      &owner_root);
        if (ret < 0)
                goto out_unlock;
 
@@ -5567,8 +5576,7 @@ skip:
                find_next_key(path, level, &wc->drop_progress);
 
                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
-                                      fs_info->nodesize, parent,
-                                      btrfs_header_owner(next));
+                                      fs_info->nodesize, parent, owner_root);
                btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
                                    0, false);
                ret = btrfs_free_extent(trans, &ref);
@@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        ret = btrfs_lookup_extent_info(trans, fs_info,
                                                       eb->start, level, 1,
                                                       &wc->refs[level],
-                                                      &wc->flags[level]);
+                                                      &wc->flags[level],
+                                                      NULL);
                        if (ret < 0) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
                                path->locks[level] = 0;
@@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
                        ret = btrfs_lookup_extent_info(trans, fs_info,
                                                path->nodes[level]->start,
                                                level, 1, &wc->refs[level],
-                                               &wc->flags[level]);
+                                               &wc->flags[level], NULL);
                        if (ret < 0) {
                                err = ret;
                                goto out_end_trans;
index 0716f65..2e06603 100644 (file)
@@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 offset, int metadata, u64 *refs, u64 *flags);
+                            u64 offset, int metadata, u64 *refs, u64 *flags,
+                            u64 *owner_root);
 int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
                     int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
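
btrfs_lookup_extent_info() gains an optional out-parameter: callers that don't need the owner pass NULL (as most of the hunks above do), while do_walk_down() uses the returned owner root for btrfs_init_generic_ref() instead of re-reading it from the block header. The NULL-tolerant shape inside the function is the usual one, sketched stand-alone with hypothetical names:

#include <stdint.h>

/* Fills only the out-params the caller actually asked for. */
static int lookup_info(uint64_t bytenr, uint64_t *refs, uint64_t *owning_root)
{
	uint64_t found_refs = 1, found_owner = 5;	/* pretend lookup */

	if (refs)
		*refs = found_refs;
	if (owning_root)	/* optional: callers may pass NULL */
		*owning_root = found_owner;
	return 0;
}
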
index 5e3fccd..9f5a989 100644 (file)
@@ -6983,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
        int ret;
 
        alloc_hint = get_extent_allocation_hint(inode, start, len);
+again:
        ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
                                   0, alloc_hint, &ins, 1, 1);
+       if (ret == -EAGAIN) {
+               ASSERT(btrfs_is_zoned(fs_info));
+               wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+                              TASK_UNINTERRUPTIBLE);
+               goto again;
+       }
        if (ret)
                return ERR_PTR(ret);
 
index 752acff..dfe257e 100644 (file)
@@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
 static noinline int copy_to_sk(struct btrfs_path *path,
                               struct btrfs_key *key,
                               struct btrfs_ioctl_search_key *sk,
-                              size_t *buf_size,
+                              u64 *buf_size,
                               char __user *ubuf,
                               unsigned long *sk_offset,
                               int *num_found)
@@ -1660,7 +1660,7 @@ out:
 
 static noinline int search_ioctl(struct inode *inode,
                                 struct btrfs_ioctl_search_key *sk,
-                                size_t *buf_size,
+                                u64 *buf_size,
                                 char __user *ubuf)
 {
        struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
        struct btrfs_ioctl_search_args __user *uargs = argp;
        struct btrfs_ioctl_search_key sk;
        int ret;
-       size_t buf_size;
+       u64 buf_size;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
        struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
        struct btrfs_ioctl_search_args_v2 args;
        int ret;
-       size_t buf_size;
-       const size_t buf_limit = SZ_16M;
+       u64 buf_size;
+       const u64 buf_limit = SZ_16M;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
index edb84cc..ce446d9 100644 (file)
@@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
        u64 bytenr = record->bytenr;
 
        if (!btrfs_qgroup_full_accounting(fs_info))
-               return 0;
+               return 1;
 
        lockdep_assert_held(&delayed_refs->lock);
        trace_btrfs_qgroup_trace_extent(fs_info, record);
@@ -2874,13 +2874,19 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
        qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
                               num_bytes, seq);
 
+       /*
+        * We're done using the iterator, release all its qgroups while holding
+        * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup()
+        * and trigger use-after-free accesses to qgroups.
+        */
+       qgroup_iterator_nested_clean(&qgroups);
+
        /*
         * Bump qgroup_seq to avoid seq overlap
         */
        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
        spin_unlock(&fs_info->qgroup_lock);
 out_free:
-       qgroup_iterator_nested_clean(&qgroups);
        ulist_free(old_roots);
        ulist_free(new_roots);
        return ret;
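
The ordering is the whole fix: the iterator list borrows qgroup pointers whose
lifetime is protected by fs_info->qgroup_lock, so the list must be emptied
before that lock is dropped. A sketch of the race being closed (interleaving
invented for illustration, not taken from the btrfs code):

	/*
	 *   accounting task                    btrfs_remove_qgroup()
	 *   ----------------                   ---------------------
	 *   spin_lock(&qgroup_lock);
	 *   ...build iterator over qgroups...
	 *   spin_unlock(&qgroup_lock);
	 *                                      spin_lock(&qgroup_lock);
	 *                                      ...free a qgroup on the list...
	 *                                      spin_unlock(&qgroup_lock);
	 *   qgroup_iterator_nested_clean();    <-- dereferences freed qgroup
	 */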
index 944e8f1..9589362 100644 (file)
@@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
                btrfs_put_bioc(bioc);
        }
 
-       return ret;
+       return 0;
 }
 
 int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
index 9ce5be2..f62a408 100644 (file)
@@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
         */
        ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);
 
+       /* @found_logical_ret must be specified. */
+       ASSERT(found_logical_ret);
+
        stripe = &sctx->stripes[sctx->cur_stripe];
        scrub_reset_stripe(stripe);
        ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
@@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
        /* Either >0 as no more extents or <0 for error. */
        if (ret)
                return ret;
-       if (found_logical_ret)
-               *found_logical_ret = stripe->logical;
+       *found_logical_ret = stripe->logical;
        sctx->cur_stripe++;
 
        /* We filled one group, submit it. */
@@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
 
        /* Go through each extent item inside the logical range */
        while (cur_logical < logical_end) {
-               u64 found_logical;
+               u64 found_logical = U64_MAX;
                u64 cur_physical = physical + cur_logical - logical_start;
 
                /* Canceled? */
@@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
                if (ret < 0)
                        break;
 
+               /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
+               ASSERT(found_logical != U64_MAX);
                cur_logical = found_logical + BTRFS_STRIPE_LEN;
 
                /* Don't hold CPU for too long time */
index c87e188..c6f1662 100644 (file)
@@ -748,13 +748,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
+               if (IS_ERR(fs_devices))
+                       return ERR_CAST(fs_devices);
+
                if (has_metadata_uuid)
                        memcpy(fs_devices->metadata_uuid,
                               disk_super->metadata_uuid, BTRFS_FSID_SIZE);
 
-               if (IS_ERR(fs_devices))
-                       return ERR_CAST(fs_devices);
-
                if (same_fsid_diff_dev) {
                        generate_random_uuid(fs_devices->fsid);
                        fs_devices->temp_fsid = true;
index 3504ade..188378c 100644 (file)
@@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
        }
 
 out:
-       if (cache->alloc_offset > fs_info->zone_size) {
-               btrfs_err(fs_info,
-                       "zoned: invalid write pointer %llu in block group %llu",
-                       cache->alloc_offset, cache->start);
-               ret = -EIO;
-       }
-
        if (cache->alloc_offset > cache->zone_capacity) {
                btrfs_err(fs_info,
 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
index 929248c..4cbe043 100644 (file)
@@ -84,8 +84,8 @@ int   nfsd_net_reply_cache_init(struct nfsd_net *nn);
 void   nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
 int    nfsd_reply_cache_init(struct nfsd_net *);
 void   nfsd_reply_cache_shutdown(struct nfsd_net *);
-int    nfsd_cache_lookup(struct svc_rqst *rqstp,
-                         struct nfsd_cacherep **cacherep);
+int    nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+                         unsigned int len, struct nfsd_cacherep **cacherep);
 void   nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
                          int cachetype, __be32 *statp);
 int    nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
index 4045c85..4041592 100644 (file)
@@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
 
        /* XXX: alternatively, we could get/drop in seq start/stop */
        drop_client(clp);
-       return 0;
+       return seq_release(inode, file);
 }
 
 static const struct file_operations client_states_fops = {
index fd56a52..d3273a3 100644 (file)
@@ -369,33 +369,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
        return freed;
 }
 
-/*
- * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+/**
+ * nfsd_cache_csum - Checksum incoming NFS Call arguments
+ * @buf: buffer containing a whole RPC Call message
+ * @start: starting byte of the NFS Call header
+ * @remaining: size of the NFS Call header, in bytes
+ *
+ * Compute a weak checksum of the leading bytes of an NFS procedure
+ * call header to help verify that a retransmitted Call matches an
+ * entry in the duplicate reply cache.
+ *
+ * To avoid assumptions about how the RPC message is laid out in
+ * @buf and what else it might contain (eg, a GSS MIC suffix), the
+ * caller passes us the exact location and length of the NFS Call
+ * header.
+ *
+ * Returns a 32-bit checksum value, as defined in RFC 793.
  */
-static __wsum
-nfsd_cache_csum(struct svc_rqst *rqstp)
+static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start,
+                             unsigned int remaining)
 {
+       unsigned int base, len;
+       struct xdr_buf subbuf;
+       __wsum csum = 0;
+       void *p;
        int idx;
-       unsigned int base;
-       __wsum csum;
-       struct xdr_buf *buf = &rqstp->rq_arg;
-       const unsigned char *p = buf->head[0].iov_base;
-       size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
-                               RC_CSUMLEN);
-       size_t len = min(buf->head[0].iov_len, csum_len);
+
+       if (remaining > RC_CSUMLEN)
+               remaining = RC_CSUMLEN;
+       if (xdr_buf_subsegment(buf, &subbuf, start, remaining))
+               return csum;
 
        /* rq_arg.head first */
-       csum = csum_partial(p, len, 0);
-       csum_len -= len;
+       if (subbuf.head[0].iov_len) {
+               len = min_t(unsigned int, subbuf.head[0].iov_len, remaining);
+               csum = csum_partial(subbuf.head[0].iov_base, len, csum);
+               remaining -= len;
+       }
 
        /* Continue into page array */
-       idx = buf->page_base / PAGE_SIZE;
-       base = buf->page_base & ~PAGE_MASK;
-       while (csum_len) {
-               p = page_address(buf->pages[idx]) + base;
-               len = min_t(size_t, PAGE_SIZE - base, csum_len);
+       idx = subbuf.page_base / PAGE_SIZE;
+       base = subbuf.page_base & ~PAGE_MASK;
+       while (remaining) {
+               p = page_address(subbuf.pages[idx]) + base;
+               len = min_t(unsigned int, PAGE_SIZE - base, remaining);
                csum = csum_partial(p, len, csum);
-               csum_len -= len;
+               remaining -= len;
                base = 0;
                ++idx;
        }
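
For reference, the "checksum as defined in RFC 793" is the 16-bit ones'
complement sum used for TCP/IP headers; the kernel's csum_partial() produces
an unfolded 32-bit partial form of it. A standalone userspace sketch of the
capped, weak header checksum (drc_csum is a hypothetical helper, and the
256-byte cap is assumed to match nfsd's RC_CSUMLEN; this version folds the
sum for simplicity, which the kernel defers to csum_fold()):

	#include <stddef.h>
	#include <stdint.h>

	#define RC_CSUMLEN 256	/* cap: only the leading bytes are summed */

	static uint32_t drc_csum(const uint8_t *hdr, size_t len)
	{
		uint64_t sum = 0;
		size_t i;

		if (len > RC_CSUMLEN)
			len = RC_CSUMLEN;
		for (i = 0; i + 1 < len; i += 2)	/* 16-bit words */
			sum += (uint32_t)hdr[i] << 8 | hdr[i + 1];
		if (i < len)				/* odd trailing byte */
			sum += (uint32_t)hdr[i] << 8;
		while (sum >> 16)			/* end-around carry */
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint32_t)sum;
	}

A weak sum is acceptable here because it only narrows DRC matches that are
already keyed on XID, procedure, and client; it does not need to be
collision-proof.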
@@ -466,6 +485,8 @@ out:
 /**
  * nfsd_cache_lookup - Find an entry in the duplicate reply cache
  * @rqstp: Incoming Call to find
+ * @start: starting byte in @rqstp->rq_arg of the NFS Call header
+ * @len: size of the NFS Call header, in bytes
  * @cacherep: OUT: DRC entry for this request
  *
  * Try to find an entry matching the current call in the cache. When none
@@ -479,7 +500,8 @@ out:
  *   %RC_REPLY: Reply from cache
  *   %RC_DROPIT: Do not process the request further
  */
-int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
+                     unsigned int len, struct nfsd_cacherep **cacherep)
 {
        struct nfsd_net         *nn;
        struct nfsd_cacherep    *rp, *found;
@@ -495,7 +517,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
                goto out;
        }
 
-       csum = nfsd_cache_csum(rqstp);
+       csum = nfsd_cache_csum(&rqstp->rq_arg, start, len);
 
        /*
         * Since the common case is a cache miss followed by an insert,
@@ -641,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
        return;
 }
 
-/*
- * Copy cached reply to current reply buffer. Should always fit.
- * FIXME as reply is in a page, we should just attach the page, and
- * keep a refcount....
- */
 static int
 nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
 {
-       struct kvec     *vec = &rqstp->rq_res.head[0];
-
-       if (vec->iov_len + data->iov_len > PAGE_SIZE) {
-               printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n",
-                               data->iov_len);
-               return 0;
-       }
-       memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
-       vec->iov_len += data->iov_len;
-       return 1;
+       __be32 *p;
+
+       p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len);
+       if (unlikely(!p))
+               return false;
+       memcpy(p, data->iov_base, data->iov_len);
+       xdr_commit_encode(&rqstp->rq_res_stream);
+       return true;
 }
 
 /*
index d6122bb..fe61d9b 100644 (file)
@@ -981,6 +981,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
        const struct svc_procedure *proc = rqstp->rq_procinfo;
        __be32 *statp = rqstp->rq_accept_statp;
        struct nfsd_cacherep *rp;
+       unsigned int start, len;
+       __be32 *nfs_reply;
 
        /*
         * Give the xdr decoder a chance to change this if it wants
@@ -988,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
         */
        rqstp->rq_cachetype = proc->pc_cachetype;
 
+       /*
+        * ->pc_decode advances the argument stream past the NFS
+        * Call header, so grab the header's starting location and
+        * size now for the call to nfsd_cache_lookup().
+        */
+       start = xdr_stream_pos(&rqstp->rq_arg_stream);
+       len = xdr_stream_remaining(&rqstp->rq_arg_stream);
        if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
                goto out_decode_err;
 
@@ -1001,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
        smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1);
 
        rp = NULL;
-       switch (nfsd_cache_lookup(rqstp, &rp)) {
+       switch (nfsd_cache_lookup(rqstp, start, len, &rp)) {
        case RC_DOIT:
                break;
        case RC_REPLY:
@@ -1010,6 +1019,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
                goto out_dropit;
        }
 
+       nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0);
        *statp = proc->pc_func(rqstp);
        if (test_bit(RQ_DROPME, &rqstp->rq_flags))
                goto out_update_drop;
@@ -1023,7 +1033,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
         */
        smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1);
 
-       nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
+       nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply);
 out_cached_reply:
        return 1;
 
index ddab9ea..3fe2dde 100644 (file)
@@ -430,7 +430,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
        struct ovl_fs_context *ctx = fc->fs_private;
        struct ovl_fs_context_layer *l;
        char *dup = NULL, *iter;
-       ssize_t nr_lower = 0, nr = 0, nr_data = 0;
+       ssize_t nr_lower, nr;
        bool data_layer = false;
 
        /*
@@ -482,6 +482,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
        iter = dup;
        l = ctx->lower;
        for (nr = 0; nr < nr_lower; nr++, l++) {
+               ctx->nr++;
                memset(l, 0, sizeof(*l));
 
                err = ovl_mount_dir(iter, &l->path);
@@ -498,10 +499,10 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
                        goto out_put;
 
                if (data_layer)
-                       nr_data++;
+                       ctx->nr_data++;
 
                /* Calling strchr() again would overrun. */
-               if ((nr + 1) == nr_lower)
+               if (ctx->nr == nr_lower)
                        break;
 
                err = -EINVAL;
@@ -511,7 +512,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
                         * This is a regular layer so we require that
                         * there are no data layers.
                         */
-                       if ((ctx->nr_data + nr_data) > 0) {
+                       if (ctx->nr_data > 0) {
                                pr_err("regular lower layers cannot follow data lower layers");
                                goto out_put;
                        }
@@ -524,8 +525,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
                data_layer = true;
                iter++;
        }
-       ctx->nr = nr_lower;
-       ctx->nr_data += nr_data;
        kfree(dup);
        return 0;
 
index 50a201e..c3f020c 100644 (file)
@@ -978,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper,
        return 0;
 }
 
-/**
+/*
  * Caller must hold a reference to inode to prevent it from being freed while
  * it is marked inuse.
  */
index 6f3285f..af7849e 100644 (file)
@@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN       13
 
-/* strlen of "host=" */
-#define HOST_KEY_LEN           5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN           6
 
 /* strlen of ";ip4=" or ";ip6=" */
 #define IP_KEY_LEN             5
index 57c2a7d..f896f60 100644 (file)
@@ -2065,6 +2065,12 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
                ses->chans[i].server = NULL;
        }
 
+       /* we now account for primary channel in iface->refcount */
+       if (ses->chans[0].iface) {
+               kref_put(&ses->chans[0].iface->refcount, release_iface);
+               ses->chans[0].server = NULL;
+       }
+
        sesInfoFree(ses);
        cifs_put_tcp_session(server, 0);
 }
index 0bb2ac9..8b2d7c1 100644 (file)
@@ -322,28 +322,32 @@ cifs_disable_secondary_channels(struct cifs_ses *ses)
                iface = ses->chans[i].iface;
                server = ses->chans[i].server;
 
+               /*
+                * remove these references first, since we need to unlock
+                * chan_lock here: iface_lock ranks above it in lock order
+                */
+               ses->chans[i].iface = NULL;
+               ses->chans[i].server = NULL;
+               spin_unlock(&ses->chan_lock);
+
                if (iface) {
                        spin_lock(&ses->iface_lock);
                        kref_put(&iface->refcount, release_iface);
-                       ses->chans[i].iface = NULL;
                        iface->num_channels--;
                        if (iface->weight_fulfilled)
                                iface->weight_fulfilled--;
                        spin_unlock(&ses->iface_lock);
                }
 
-               spin_unlock(&ses->chan_lock);
-               if (server && !server->terminate) {
-                       server->terminate = true;
-                       cifs_signal_cifsd_for_reconnect(server, false);
-               }
-               spin_lock(&ses->chan_lock);
-
                if (server) {
-                       ses->chans[i].server = NULL;
+                       if (!server->terminate) {
+                               server->terminate = true;
+                               cifs_signal_cifsd_for_reconnect(server, false);
+                       }
                        cifs_put_tcp_session(server, false);
                }
 
+               spin_lock(&ses->chan_lock);
        }
 
 done:
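
The restructuring enforces a lock-ranking rule: iface_lock sits above
chan_lock, so the channel slots are snapshotted and cleared first, and
iface_lock is only taken after chan_lock has been dropped. In sketch form
(condensed from the hunk above, not the full function):

	spin_lock(&ses->chan_lock);
	iface = ses->chans[i].iface;	/* snapshot under chan_lock */
	ses->chans[i].iface = NULL;	/* ...and clear the slot */
	spin_unlock(&ses->chan_lock);

	spin_lock(&ses->iface_lock);	/* safe: chan_lock already dropped */
	kref_put(&iface->refcount, release_iface);
	spin_unlock(&ses->iface_lock);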
index 84ea673..5a3ca62 100644 (file)
@@ -458,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses,
                                  ptriplet->encryption.context,
                                  ses->smb3encryptionkey,
                                  SMB3_ENC_DEC_KEY_SIZE);
+               if (rc)
+                       return rc;
                rc = generate_key(ses, ptriplet->decryption.label,
                                  ptriplet->decryption.context,
                                  ses->smb3decryptionkey,
@@ -466,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses,
                        return rc;
        }
 
-       if (rc)
-               return rc;
-
 #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
        cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__);
        /*
index ed0bc8c..567fb37 100644 (file)
@@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS
        bool "XFS online metadata check usage data collection"
        default y
        depends on XFS_ONLINE_SCRUB
-       select XFS_DEBUG
+       select DEBUG_FS
        help
          If you say Y here, the kernel will gather usage data about
          the online metadata check subsystem.  This includes the number
index 3069194..100ab59 100644 (file)
@@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist(
 
        ASSERT(mp->m_alloc_maxlevels > 0);
 
+       /*
+        * For a btree shorter than the maximum height, the worst case is that
+        * every level gets split and a new level is added, then while inserting
+        * another entry to refill the AGFL, every level under the old root gets
+        * split again. This is:
+        *
+        *   (full height split reservation) + (AGFL refill split height)
+        * = (current height + 1) + (current height - 1)
+        * = (new height) + (new height - 2)
+        * = 2 * new height - 2
+        *
+        * For a btree of maximum height, the worst case is that every level
+        * under the root gets split, then while inserting another entry to
+        * refill the AGFL, every level under the root gets split again. This is
+        * also:
+        *
+        *   2 * (current height - 1)
+        * = 2 * (new height - 1)
+        * = 2 * new height - 2
+        */
+
        /* space needed by-bno freespace btree */
        min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
-                                      mp->m_alloc_maxlevels);
+                                      mp->m_alloc_maxlevels) * 2 - 2;
        /* space needed by-size freespace btree */
        min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
-                                      mp->m_alloc_maxlevels);
+                                      mp->m_alloc_maxlevels) * 2 - 2;
        /* space needed reverse mapping used space btree */
        if (xfs_has_rmapbt(mp))
                min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
-                                               mp->m_rmap_maxlevels);
+                                               mp->m_rmap_maxlevels) * 2 - 2;
 
        return min_free;
 }
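
A quick numeric check of the new formula, as a standalone sketch (function
name and heights invented for illustration):

	#include <stdio.h>

	/* Worst-case AGFL reservation for one free-space btree. */
	static unsigned int btree_min_free(unsigned int height,
					   unsigned int maxlevels)
	{
		unsigned int h = height + 1 < maxlevels ? height + 1
							: maxlevels;

		return 2 * h - 2;	/* full split + AGFL-refill split */
	}

	int main(void)
	{
		/* current height 3, max 5: min(4, 5) * 2 - 2 == 6 blocks */
		printf("%u\n", btree_min_free(3, 5));
		/* already at max height 5: min(6, 5) * 2 - 2 == 8 blocks */
		printf("%u\n", btree_min_free(5, 5));
		return 0;
	}

Both cases reduce to "2 * new height - 2", matching the comment's two
derivations.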
index bcfb6a4..f71679c 100644 (file)
@@ -245,21 +245,18 @@ xfs_defer_create_intents(
        return ret;
 }
 
-/* Abort all the intents that were committed. */
 STATIC void
-xfs_defer_trans_abort(
-       struct xfs_trans                *tp,
-       struct list_head                *dop_pending)
+xfs_defer_pending_abort(
+       struct xfs_mount                *mp,
+       struct list_head                *dop_list)
 {
        struct xfs_defer_pending        *dfp;
        const struct xfs_defer_op_type  *ops;
 
-       trace_xfs_defer_trans_abort(tp, _RET_IP_);
-
        /* Abort intent items that don't have a done item. */
-       list_for_each_entry(dfp, dop_pending, dfp_list) {
+       list_for_each_entry(dfp, dop_list, dfp_list) {
                ops = defer_op_types[dfp->dfp_type];
-               trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
+               trace_xfs_defer_pending_abort(mp, dfp);
                if (dfp->dfp_intent && !dfp->dfp_done) {
                        ops->abort_intent(dfp->dfp_intent);
                        dfp->dfp_intent = NULL;
@@ -267,6 +264,16 @@ xfs_defer_trans_abort(
        }
 }
 
+/* Abort all the intents that were committed. */
+STATIC void
+xfs_defer_trans_abort(
+       struct xfs_trans                *tp,
+       struct list_head                *dop_pending)
+{
+       trace_xfs_defer_trans_abort(tp, _RET_IP_);
+       xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+}
+
 /*
  * Capture resources that the caller said not to release ("held") when the
  * transaction commits.  Caller is responsible for zero-initializing @dres.
@@ -756,12 +763,13 @@ xfs_defer_ops_capture(
 
 /* Release all resources that we used to capture deferred ops. */
 void
-xfs_defer_ops_capture_free(
+xfs_defer_ops_capture_abort(
        struct xfs_mount                *mp,
        struct xfs_defer_capture        *dfc)
 {
        unsigned short                  i;
 
+       xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
        xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
 
        for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit(
        /* Commit the transaction and add the capture structure to the list. */
        error = xfs_trans_commit(tp);
        if (error) {
-               xfs_defer_ops_capture_free(mp, dfc);
+               xfs_defer_ops_capture_abort(mp, dfc);
                return error;
        }
 
index 114a3a4..8788ad5 100644 (file)
@@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
                struct list_head *capture_list);
 void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
                struct xfs_defer_resources *dres);
-void xfs_defer_ops_capture_free(struct xfs_mount *mp,
+void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
                struct xfs_defer_capture *d);
 void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
 
index 543f374..137a65b 100644 (file)
@@ -510,6 +510,9 @@ xfs_dinode_verify(
        if (mode && nextents + naextents > nblocks)
                return __this_address;
 
+       if (nextents + naextents == 0 && nblocks != 0)
+               return __this_address;
+
        if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
                return __this_address;
 
index 0e5dba2..144198a 100644 (file)
@@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2(
        struct xfs_log_dinode           *ldip;
        uint                            isize;
        int                             need_free = 0;
+       xfs_failaddr_t                  fa;
 
        if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
                in_f = item->ri_buf[0].i_addr;
@@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2(
         * superblock flag to determine whether we need to look at di_flushiter
         * to skip replay when the on disk inode is newer than the log one
         */
-       if (!xfs_has_v3inodes(mp) &&
-           ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
-               /*
-                * Deal with the wrap case, DI_MAX_FLUSH is less
-                * than smaller numbers
-                */
-               if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
-                   ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
-                       /* do nothing */
-               } else {
-                       trace_xfs_log_recover_inode_skip(log, in_f);
-                       error = 0;
-                       goto out_release;
+       if (!xfs_has_v3inodes(mp)) {
+               if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+                       /*
+                        * Deal with the wrap case, DI_MAX_FLUSH is less
+                        * than smaller numbers
+                        */
+                       if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+                           ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+                               /* do nothing */
+                       } else {
+                               trace_xfs_log_recover_inode_skip(log, in_f);
+                               error = 0;
+                               goto out_release;
+                       }
                }
+
+               /* Take the opportunity to reset the flush iteration count */
+               ldip->di_flushiter = 0;
        }
 
-       /* Take the opportunity to reset the flush iteration count */
-       ldip->di_flushiter = 0;
 
        if (unlikely(S_ISREG(ldip->di_mode))) {
                if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -528,8 +531,19 @@ out_owner_change:
            (dip->di_mode != 0))
                error = xfs_recover_inode_owner_change(mp, dip, in_f,
                                                       buffer_list);
-       /* re-generate the checksum. */
+       /* re-generate the checksum and validate the recovered inode. */
        xfs_dinode_calc_crc(log->l_mp, dip);
+       fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip);
+       if (fa) {
+               XFS_CORRUPTION_ERROR(
+                       "Bad dinode after recovery",
+                               XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip));
+               xfs_alert(mp,
+                       "Metadata corruption detected at %pS, inode 0x%llx",
+                       fa, in_f->ilf_ino);
+               error = -EFSCORRUPTED;
+               goto out_release;
+       }
 
        ASSERT(bp->b_mount == mp);
        bp->b_flags |= _XBF_LOGRECOVERY;
index 51c100c..ee206fa 100644 (file)
@@ -1893,9 +1893,7 @@ xlog_write_iclog(
                 * the buffer manually, the code needs to be kept in sync
                 * with the I/O completion path.
                 */
-               xlog_state_done_syncing(iclog);
-               up(&iclog->ic_sema);
-               return;
+               goto sync;
        }
 
        /*
@@ -1925,20 +1923,17 @@ xlog_write_iclog(
                 * avoid shutdown re-entering this path and erroring out again.
                 */
                if (log->l_targ != log->l_mp->m_ddev_targp &&
-                   blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) {
-                       xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
-                       return;
-               }
+                   blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev))
+                       goto shutdown;
        }
        if (iclog->ic_flags & XLOG_ICL_NEED_FUA)
                iclog->ic_bio.bi_opf |= REQ_FUA;
 
        iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
 
-       if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
-               xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
-               return;
-       }
+       if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
+               goto shutdown;
+
        if (is_vmalloc_addr(iclog->ic_data))
                flush_kernel_vmap_range(iclog->ic_data, count);
 
@@ -1959,6 +1954,12 @@ xlog_write_iclog(
        }
 
        submit_bio(&iclog->ic_bio);
+       return;
+shutdown:
+       xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+sync:
+       xlog_state_done_syncing(iclog);
+       up(&iclog->ic_sema);
 }
 
 /*
index 13b94d2..a1e18b2 100644 (file)
@@ -2511,7 +2511,7 @@ xlog_abort_defer_ops(
 
        list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
                list_del_init(&dfc->dfc_list);
-               xfs_defer_ops_capture_free(mp, dfc);
+               xfs_defer_ops_capture_abort(mp, dfc);
        }
 }
 
index 658edee..e5b62dc 100644 (file)
@@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent(
                }
        }
        del = got;
+       xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
 
        /* Grab the corresponding mapping in the data fork. */
        nmaps = 1;
index b4825d3..6762dac 100644 (file)
@@ -56,7 +56,7 @@ extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+extern bool bpf_global_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
@@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
        aux->ctx_field_size = size;
 }
 
+static bool bpf_is_ldimm64(const struct bpf_insn *insn)
+{
+       return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
 static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
-       return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
-              insn->src_reg == BPF_PSEUDO_FUNC;
+       return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
 struct bpf_prog_ops {
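
Context for the helper: BPF_LD | BPF_IMM | BPF_DW is the only 16-byte eBPF
instruction, occupying two struct bpf_insn slots with the 64-bit immediate
split across them. A sketch of the encoding (immediate values invented):

	#include <linux/bpf.h>	/* struct bpf_insn */

	/* r1 = 0x0000000511223344, spread over two insn slots */
	static const struct bpf_insn ld_imm64[] = {
		{ .code = BPF_LD | BPF_DW | BPF_IMM,
		  .dst_reg = 1, .imm = 0x11223344 },	/* low 32 bits  */
		{ .code = 0, .imm = 0x5 },		/* high 32 bits */
	};

The second slot is not a real instruction, which is why the verifier changes
below step the CFG walk by insn_sz and reject any jump that lands on it as
"jump into the middle of ldimm64".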
index d305db7..efc0c0b 100644 (file)
@@ -195,6 +195,7 @@ enum cpuhp_state {
        CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
        CPUHP_AP_ARM64_ISNDEP_STARTING,
        CPUHP_AP_SMPCFD_DYING,
+       CPUHP_AP_HRTIMERS_DYING,
        CPUHP_AP_X86_TBOOT_DYING,
        CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
        CPUHP_AP_ONLINE,
index 0ee1401..f2044d5 100644 (file)
@@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-int hrtimers_dead_cpu(unsigned int cpu);
+int hrtimers_cpu_dying(unsigned int cpu);
 #else
-#define hrtimers_dead_cpu      NULL
+#define hrtimers_cpu_dying     NULL
 #endif
 
 #endif
index 8fa23bd..007fd9c 100644 (file)
@@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising)
  * A function that translates value of following registers to the linkmode:
  * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20)
  * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60)
- * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61)
+ * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61)
  */
 static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val)
 {
index afb028c..5547ba6 100644 (file)
@@ -843,11 +843,11 @@ struct perf_event {
 };
 
 /*
- *           ,-----------------------[1:n]----------------------.
- *           V                                                  V
- * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
- *           ^                      ^     |                     |
- *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
+ *           ,-----------------------[1:n]------------------------.
+ *           V                                                    V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
+ *                                        |                       |
+ *                                        `--[n:1]-> pmu <-[1:n]--'
  *
  *
  * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
@@ -865,6 +865,9 @@ struct perf_event {
  * ctx->mutex pinning the configuration. Since we hold a reference on
  * group_leader (through the filedesc) it can't go away, therefore it's
  * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ *
+ * perf_event holds a refcount on perf_event_context
+ * perf_event holds a refcount on perf_event_pmu_context
  */
 struct perf_event_pmu_context {
        struct pmu                      *pmu;
index c36e7a3..3be2cb5 100644 (file)
@@ -14,6 +14,7 @@
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
 #include <asm/stacktrace.h>
+#include <linux/linkage.h>
 
 /*
  * The lowest address on tsk's stack which we can plausibly erase.
@@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t)
 # endif
 }
 
+asmlinkage void noinstr stackleak_erase(void);
+asmlinkage void noinstr stackleak_erase_on_task_stack(void);
+asmlinkage void noinstr stackleak_erase_off_task_stack(void);
+void __no_caller_saved_registers noinstr stackleak_track_stack(void);
+
 #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
 static inline void stackleak_task_init(struct task_struct *t) { }
 #endif
index d0f2797..a09e13a 100644 (file)
@@ -5,13 +5,6 @@
 #include <linux/pci.h>
 #include <linux/virtio_pci.h>
 
-struct virtio_pci_modern_common_cfg {
-       struct virtio_pci_common_cfg cfg;
-
-       __le16 queue_notify_data;       /* read-write */
-       __le16 queue_reset;             /* read-write */
-};
-
 /**
  * struct virtio_pci_modern_device - info for modern PCI virtio
  * @pci_dev:       Ptr to the PCI device struct
index 3bbd13a..b157c5c 100644 (file)
@@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg)
        return *(__force __be32 *)sreg;
 }
 
-static inline void nft_reg_store64(u32 *dreg, u64 val)
+static inline void nft_reg_store64(u64 *dreg, u64 val)
 {
-       put_unaligned(val, (u64 *)dreg);
+       put_unaligned(val, dreg);
 }
 
 static inline u64 nft_reg_load64(const u32 *sreg)
index 8a6dbfb..77f87c6 100644 (file)
@@ -58,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a)
        return to_ct_params(a)->nf_ft;
 }
 
+static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a)
+{
+       return to_ct_params(a)->helper;
+}
+
 #else
 static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; }
 static inline int tcf_ct_action(const struct tc_action *a) { return 0; }
@@ -65,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a)
 {
        return NULL;
 }
+static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a)
+{
+       return NULL;
+}
 #endif /* CONFIG_NF_CONNTRACK */
 
 #if IS_ENABLED(CONFIG_NET_ACT_CT)
index c25fc96..d24e8e1 100644 (file)
  */
 #define BTRFS_METADATA_ITEM_KEY        169
 
+/*
+ * Special inline ref key which stores the id of the subvolume which originally
+ * created the extent. This subvolume owns the extent permanently from the
+ * perspective of simple quotas. Needed to know which subvolume to free quota
+ * usage from when the extent is deleted.
+ *
+ * Stored as an inline ref to avoid wasting space on a separate item on
+ * top of the existing extent item. However, unlike the other inline refs,
+ * there is only one owner ref per extent rather than one per extent owner.
+ *
+ * Because of this, it goes at the front of the list of inline refs, and thus
+ * must have a lower type value than any other inline ref type (to satisfy the
+ * disk format rule that inline refs have non-decreasing type).
+ */
+#define BTRFS_EXTENT_OWNER_REF_KEY     172
+
 #define BTRFS_TREE_BLOCK_REF_KEY       176
 
 #define BTRFS_EXTENT_DATA_REF_KEY      178
 
 #define BTRFS_SHARED_DATA_REF_KEY      184
 
-/*
- * Special inline ref key which stores the id of the subvolume which originally
- * created the extent. This subvolume owns the extent permanently from the
- * perspective of simple quotas. Needed to know which subvolume to free quota
- * usage from when the extent is deleted.
- */
-#define BTRFS_EXTENT_OWNER_REF_KEY     188
-
 /*
  * block groups give us hints into the extent allocation trees.  Which
  * blocks are free etc etc
index f703afc..44f4dd2 100644 (file)
@@ -166,6 +166,17 @@ struct virtio_pci_common_cfg {
        __le32 queue_used_hi;           /* read-write */
 };
 
+/*
+ * Warning: do not use sizeof on this struct; use offsetofend() for
+ * the specific fields you need.
+ */
+struct virtio_pci_modern_common_cfg {
+       struct virtio_pci_common_cfg cfg;
+
+       __le16 queue_notify_data;       /* read-write */
+       __le16 queue_reset;             /* read-write */
+};
+
 /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */
 struct virtio_pci_cfg_cap {
        struct virtio_pci_cap cap;
index 23932b0..3b07409 100644 (file)
@@ -88,7 +88,6 @@ void xen_irq_resume(void);
 
 /* Clear an irq's pending state, in preparation for polling on it */
 void xen_clear_irq_pending(int irq);
-void xen_set_irq_pending(int irq);
 bool xen_test_irq_pending(int irq);
 
 /* Poll waiting for an irq to become pending.  In the usual case, the
@@ -101,8 +100,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout);
 
 /* Determine the IRQ which is bound to an event channel */
 unsigned int irq_from_evtchn(evtchn_port_t evtchn);
-int irq_from_virq(unsigned int cpu, unsigned int virq);
-evtchn_port_t evtchn_from_irq(unsigned irq);
+int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq,
+                        evtchn_port_t *evtchn);
 
 int xen_set_callback_via(uint64_t via);
 int xen_evtchn_do_upcall(void);
@@ -122,9 +121,6 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc,
 /* De-allocates the above mentioned physical interrupt. */
 int xen_destroy_irq(int irq);
 
-/* Return irq from pirq */
-int xen_irq_from_pirq(unsigned pirq);
-
 /* Return the pirq allocated to the irq. */
 int xen_pirq_from_irq(unsigned irq);
 
index f04a430..976e950 100644 (file)
@@ -145,13 +145,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
        if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                struct io_sq_data *sq = ctx->sq_data;
 
-               if (mutex_trylock(&sq->lock)) {
-                       if (sq->thread) {
-                               sq_pid = task_pid_nr(sq->thread);
-                               sq_cpu = task_cpu(sq->thread);
-                       }
-                       mutex_unlock(&sq->lock);
-               }
+               sq_pid = sq->task_pid;
+               sq_cpu = sq->sq_cpu;
        }
 
        seq_printf(m, "SqThread:\t%d\n", sq_pid);
index bd6c2c7..65b5dbe 100644 (file)
@@ -214,6 +214,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
                        did_sig = get_signal(&ksig);
                cond_resched();
                mutex_lock(&sqd->lock);
+               sqd->sq_cpu = raw_smp_processor_id();
        }
        return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 }
@@ -229,10 +230,15 @@ static int io_sq_thread(void *data)
        snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
        set_task_comm(current, buf);
 
-       if (sqd->sq_cpu != -1)
+       /* reset to our pid after we've set task_comm, for fdinfo */
+       sqd->task_pid = current->pid;
+
+       if (sqd->sq_cpu != -1) {
                set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
-       else
+       } else {
                set_cpus_allowed_ptr(current, cpu_online_mask);
+               sqd->sq_cpu = raw_smp_processor_id();
+       }
 
        mutex_lock(&sqd->lock);
        while (1) {
@@ -261,6 +267,7 @@ static int io_sq_thread(void *data)
                                mutex_unlock(&sqd->lock);
                                cond_resched();
                                mutex_lock(&sqd->lock);
+                               sqd->sq_cpu = raw_smp_processor_id();
                        }
                        continue;
                }
@@ -294,6 +301,7 @@ static int io_sq_thread(void *data)
                                mutex_unlock(&sqd->lock);
                                schedule();
                                mutex_lock(&sqd->lock);
+                               sqd->sq_cpu = raw_smp_processor_id();
                        }
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                atomic_andnot(IORING_SQ_NEED_WAKEUP,
index 91e82e3..7a98cd1 100644 (file)
@@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
        if (tsk != current)
                return 0;
 
-       if (WARN_ON_ONCE(!current->mm))
+       if (!current->mm)
                return 0;
        exe_file = get_mm_exe_file(current->mm);
        if (!exe_file)
index 08626b5..cd3afe5 100644 (file)
@@ -64,8 +64,8 @@
 #define OFF    insn->off
 #define IMM    insn->imm
 
-struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
 
 /* No hurry in this branch
  *
@@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
 
        ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
        bpf_global_ma_set = !ret;
-       ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-       bpf_global_percpu_ma_set = !ret;
-       return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
+       return ret;
 }
 late_initcall(bpf_global_ma_init);
 #endif
index bd1c42e..6da370a 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/poison.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
 #include <net/xdp.h>
 
 #include "disasm.h"
@@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
 #undef BPF_LINK_TYPE
 };
 
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
 struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
 
 static const struct bpf_line_info *
 find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
@@ -3516,12 +3521,29 @@ static int push_jmp_history(struct bpf_verifier_env *env,
 
 /* Backtrack one insn at a time. If idx is not at the top of recorded
  * history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
  */
 static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
                             u32 *history)
 {
        u32 cnt = *history;
 
+       if (i == st->first_insn_idx) {
+               if (cnt == 0)
+                       return -ENOENT;
+               if (cnt == 1 && st->jmp_history[0].idx == i)
+                       return -ENOENT;
+       }
+
        if (cnt && st->jmp_history[cnt - 1].idx == i) {
                i = st->jmp_history[cnt - 1].prev_idx;
                (*history)--;
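
A worked illustration of the new exhaustion checks (indices invented): take
a state with first_insn_idx == 3 covering the loop 3->4->5->3, whose history
holds {idx: 3, prev_idx: 7} from the checkpointing jump out of the parent
state, plus {idx: 3, prev_idx: 5} for the back-edge. Backtracking first
reaches i == 3 with cnt == 2: neither new condition fires, so the back-edge
entry is consumed and the walk continues 5 -> 4 -> 3. On the second visit
cnt == 1 and jmp_history[0].idx == 3, i.e. only the parent-transition entry
remains, and -ENOENT correctly ends the walk within this state. The removed
"i == first_idx" test in __mark_chain_precision() below would have stopped
on the first visit and skipped the loop body entirely.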
@@ -4401,10 +4423,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
                                 * Nothing to be tracked further in the parent state.
                                 */
                                return 0;
-                       if (i == first_idx)
-                               break;
                        subseq_idx = i;
                        i = get_prev_insn_idx(st, i, &history);
+                       if (i == -ENOENT)
+                               break;
                        if (i >= env->prog->len) {
                                /* This can happen if backtracking reached insn 0
                                 * and there are still reg_mask or stack_mask
@@ -12074,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                                if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
                                        return -ENOMEM;
 
-                               if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
-                                       return -ENOMEM;
+                               if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+                                       if (!bpf_global_percpu_ma_set) {
+                                               mutex_lock(&bpf_percpu_ma_lock);
+                                               if (!bpf_global_percpu_ma_set) {
+                                                       err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+                                                       if (!err)
+                                                               bpf_global_percpu_ma_set = true;
+                                               }
+                                               mutex_unlock(&bpf_percpu_ma_lock);
+                                               if (err)
+                                                       return err;
+                                       }
+                               }
 
                                if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
                                        verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
@@ -15386,8 +15419,7 @@ enum {
  * w - next instruction
  * e - edge
  */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
-                    bool loop_ok)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 {
        int *insn_stack = env->cfg.insn_stack;
        int *insn_state = env->cfg.insn_state;
@@ -15419,7 +15451,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
                insn_stack[env->cfg.cur_stack++] = w;
                return KEEP_EXPLORING;
        } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-               if (loop_ok && env->bpf_capable)
+               if (env->bpf_capable)
                        return DONE_EXPLORING;
                verbose_linfo(env, t, "%d: ", t);
                verbose_linfo(env, w, "%d: ", w);
@@ -15439,24 +15471,20 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
                                struct bpf_verifier_env *env,
                                bool visit_callee)
 {
-       int ret;
+       int ret, insn_sz;
 
-       ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+       insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+       ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
        if (ret)
                return ret;
 
-       mark_prune_point(env, t + 1);
+       mark_prune_point(env, t + insn_sz);
        /* when we exit from subprog, we need to record non-linear history */
-       mark_jmp_point(env, t + 1);
+       mark_jmp_point(env, t + insn_sz);
 
        if (visit_callee) {
                mark_prune_point(env, t);
-               ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
-                               /* It's ok to allow recursion from CFG point of
-                                * view. __check_func_call() will do the actual
-                                * check.
-                                */
-                               bpf_pseudo_func(insns + t));
+               ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
        }
        return ret;
 }
@@ -15469,15 +15497,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 static int visit_insn(int t, struct bpf_verifier_env *env)
 {
        struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
-       int ret, off;
+       int ret, off, insn_sz;
 
        if (bpf_pseudo_func(insn))
                return visit_func_call_insn(t, insns, env, true);
 
        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insn->code) != BPF_JMP &&
-           BPF_CLASS(insn->code) != BPF_JMP32)
-               return push_insn(t, t + 1, FALLTHROUGH, env, false);
+           BPF_CLASS(insn->code) != BPF_JMP32) {
+               insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+               return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+       }
 
        switch (BPF_OP(insn->code)) {
        case BPF_EXIT:
@@ -15523,8 +15553,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
                        off = insn->imm;
 
                /* unconditional jump with single edge */
-               ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
-                               true);
+               ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
                if (ret)
                        return ret;
 
@@ -15537,11 +15566,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
                /* conditional jump with two edges */
                mark_prune_point(env, t);
 
-               ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+               ret = push_insn(t, t + 1, FALLTHROUGH, env);
                if (ret)
                        return ret;
 
-               return push_insn(t, t + insn->off + 1, BRANCH, env, true);
+               return push_insn(t, t + insn->off + 1, BRANCH, env);
        }
 }
 
@@ -15607,11 +15636,21 @@ walk_cfg:
        }
 
        for (i = 0; i < insn_cnt; i++) {
+               struct bpf_insn *insn = &env->prog->insnsi[i];
+
                if (insn_state[i] != EXPLORED) {
                        verbose(env, "unreachable insn %d\n", i);
                        ret = -EINVAL;
                        goto err_free;
                }
+               if (bpf_is_ldimm64(insn)) {
+                       if (insn_state[i + 1] != 0) {
+                               verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+                               ret = -EINVAL;
+                               goto err_free;
+                       }
+                       i++; /* skip second half of ldimm64 */
+               }
        }
        ret = 0; /* cfg looks good */
 
index 1d5b9de..4b9ff41 100644 (file)
@@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
        return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
 }
 
-static int cgroup_pressure_open(struct kernfs_open_file *of)
-{
-       if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-               return -EPERM;
-
-       return 0;
-}
-
 static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
        struct cgroup_file_ctx *ctx = of->priv;
@@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
        {
                .name = "io.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
-               .open = cgroup_pressure_open,
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
        {
                .name = "memory.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
-               .open = cgroup_pressure_open,
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = {
        {
                .name = "cpu.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
-               .open = cgroup_pressure_open,
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = {
        {
                .name = "irq.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
-               .open = cgroup_pressure_open,
                .seq_show = cgroup_irq_pressure_show,
                .write = cgroup_irq_pressure_write,
                .poll = cgroup_pressure_poll,
index 9e4c678..a86972a 100644 (file)
@@ -2113,7 +2113,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                   = "hrtimers:prepare",
                .startup.single         = hrtimers_prepare_cpu,
-               .teardown.single        = hrtimers_dead_cpu,
+               .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                   = "smpcfd:prepare",
@@ -2205,6 +2205,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                .startup.single         = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
+       [CPUHP_AP_HRTIMERS_DYING] = {
+               .name                   = "hrtimers:dying",
+               .startup.single         = NULL,
+               .teardown.single        = hrtimers_cpu_dying,
+       },
+
        /* Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization */
        [CPUHP_AP_ONLINE] = {
index 683dc08..b704d83 100644 (file)
@@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
        void *task_ctx_data = NULL;
 
        if (!ctx->task) {
+               /*
+                * perf_pmu_migrate_context() / __perf_pmu_install_event()
+                * relies on the fact that find_get_pmu_context() cannot fail
+                * for CPU contexts.
+                */
                struct perf_cpu_pmu_context *cpc;
 
                cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
@@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu,
                                     int cpu, struct perf_event *event)
 {
        struct perf_event_pmu_context *epc;
+       struct perf_event_context *old_ctx = event->ctx;
+
+       get_ctx(ctx); /* normally find_get_context() */
 
        event->cpu = cpu;
        epc = find_get_pmu_context(pmu, ctx, event);
@@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu,
        if (event->state >= PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_INACTIVE;
        perf_install_in_context(ctx, event, cpu);
+
+       /*
+        * Now that event->ctx is updated and visible, put the old ctx.
+        */
+       put_ctx(old_ctx);
 }
 
 static void __perf_pmu_install(struct perf_event_context *ctx,
@@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        struct perf_event_context *src_ctx, *dst_ctx;
        LIST_HEAD(events);
 
+       /*
+        * Since per-cpu context is persistent, no need to grab an extra
+        * reference.
+        */
        src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
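
The two comments describe the classic reference-handoff discipline: pin the
destination context before the event is published into it, and drop the source
context only once the switch is visible. A self-contained userspace sketch of
the same pattern (names are illustrative):

        #include <stdatomic.h>
        #include <stdlib.h>

        struct ctx { atomic_int refs; };

        static void get_ctx(struct ctx *c) { atomic_fetch_add(&c->refs, 1); }

        static void put_ctx(struct ctx *c)
        {
                if (atomic_fetch_sub(&c->refs, 1) == 1)
                        free(c);                /* last reference dropped */
        }

        /* Re-home *slot: the object is never reachable through a context
         * whose refcount could already have hit zero. */
        static void install(struct ctx **slot, struct ctx *new_ctx)
        {
                struct ctx *old = *slot;

                get_ctx(new_ctx);       /* pin the new owner first */
                *slot = new_ctx;        /* publish */
                put_ctx(old);           /* only now release the old one */
        }
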
 
index 52695c5..dad981a 100644 (file)
@@ -700,7 +700,8 @@ retry:
        owner = uval & FUTEX_TID_MASK;
 
        if (pending_op && !pi && !owner) {
-               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+                          FUTEX_BITSET_MATCH_ANY);
                return 0;
        }
 
@@ -752,8 +753,10 @@ retry:
         * Wake robust non-PI futexes here. The wakeup of
         * PI futexes happens in exit_pi_state():
         */
-       if (!pi && (uval & FUTEX_WAITERS))
-               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+       if (!pi && (uval & FUTEX_WAITERS)) {
+               futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+                          FUTEX_BITSET_MATCH_ANY);
+       }
 
        return 0;
 }
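
Both call sites now pass a real flags word (32-bit operand size, shared futex)
instead of the magic 1 that predates the futex flags rework. The bitset wakeup
they rely on is also reachable from userspace, where FUTEX_BITSET_MATCH_ANY
wakes waiters regardless of the bitset they slept with; a minimal sketch:

        #include <linux/futex.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        /* Wake at most one waiter on @uaddr, whatever bitset it used. */
        static long wake_any_one(unsigned int *uaddr)
        {
                return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, 1,
                               NULL, NULL, FUTEX_BITSET_MATCH_ANY);
        }
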
index 2048138..d7a3c63 100644 (file)
@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                          unsigned long weight)
+{
+       unsigned long old_weight = se->load.weight;
+       u64 avruntime = avg_vruntime(cfs_rq);
+       s64 vlag, vslice;
+
+       /*
+        * VRUNTIME
+        * ========
+        *
+        * COROLLARY #1: The virtual runtime of the entity needs to be
+        * adjusted if it is re-weighted at a !0-lag point.
+        *
+        * Proof: For contradiction assume this is not true, so we can
+        * re-weight without changing vruntime at !0-lag point.
+        *
+        *             Weight   VRuntime   Avg-VRuntime
+        *     before    w          v            V
+        *      after    w'         v'           V'
+        *
+        * Since lag needs to be preserved through re-weight:
+        *
+        *      lag = (V - v)*w = (V'- v')*w', where v = v'
+        *      ==>     V' = (V - v)*w/w' + v           (1)
+        *
+        * Let W be the total weight of the entities before reweight,
+        * since V' is the new weighted average of entities:
+        *
+        *      V' = (WV + w'v - wv) / (W + w' - w)     (2)
+        *
+        * by using (1) & (2) we obtain:
+        *
+        *      (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+        *      ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+        *      ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+        *      ==>     (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+        *
+        * Since we are re-weighting at a !0-lag point, which means
+        * V != v, we can simplify (3):
+        *
+        *      ==>     W / (W + w' - w) = w / w'
+        *      ==>     Ww' = Ww + ww' - ww
+        *      ==>     W * (w' - w) = w * (w' - w)
+        *      ==>     W = w   (re-weight indicates w' != w)
+        *
+        * So the cfs_rq contains only one entity, hence the vruntime of
+        * the entity @v should always equal the cfs_rq's weighted
+        * average vruntime @V, which means we would always re-weight
+        * at a 0-lag point, contradicting the assumption. Proof completed.
+        *
+        *
+        * COROLLARY #2: Re-weight does NOT affect the weighted average
+        * vruntime of the entities.
+        *
+        * Proof: According to corollary #1, Eq. (1) should be:
+        *
+        *      (V - v)*w = (V' - v')*w'
+        *      ==>    v' = V' - (V - v)*w/w'           (4)
+        *
+        * According to the weighted average formula, we have:
+        *
+        *      V' = (WV - wv + w'v') / (W - w + w')
+        *         = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+        *         = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+        *         = (WV + w'V' - Vw) / (W - w + w')
+        *
+        *      ==>  V'*(W - w + w') = WV + w'V' - Vw
+        *      ==>     V' * (W - w) = (W - w) * V      (5)
+        *
+        * If the entity is the only one in the cfs_rq, then reweight
+        * always occurs at a 0-lag point, so V won't change. Otherwise
+        * there are other entities, hence W != w, and Eq. (5) turns
+        * into V' = V. So V won't change in either case; proof done.
+        *
+        *
+        * So according to corollary #1 & #2, the effect of re-weight
+        * on vruntime should be:
+        *
+        *      v' = V' - (V - v) * w / w'              (4)
+        *         = V  - (V - v) * w / w'
+        *         = V  - vl * w / w'
+        *         = V  - vl'
+        */
+       if (avruntime != se->vruntime) {
+               vlag = (s64)(avruntime - se->vruntime);
+               vlag = div_s64(vlag * old_weight, weight);
+               se->vruntime = avruntime - vlag;
+       }
+
+       /*
+        * DEADLINE
+        * ========
+        *
+        * When the weight changes, the virtual time slope changes and
+        * we should adjust the relative virtual deadline accordingly.
+        *
+        *      d' = v' + (d - v)*w/w'
+        *         = V' - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  - (V - v)*w/w' + (d - v)*w/w'
+        *         = V  + (d - V)*w/w'
+        */
+       vslice = (s64)(se->deadline - avruntime);
+       vslice = div_s64(vslice * old_weight, weight);
+       se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
 {
-       unsigned long old_weight = se->load.weight;
+       bool curr = cfs_rq->curr == se;
 
        if (se->on_rq) {
                /* commit outstanding execution time */
-               if (cfs_rq->curr == se)
+               if (curr)
                        update_curr(cfs_rq);
                else
-                       avg_vruntime_sub(cfs_rq, se);
+                       __dequeue_entity(cfs_rq, se);
                update_load_sub(&cfs_rq->load, se->load.weight);
        }
        dequeue_load_avg(cfs_rq, se);
 
-       update_load_set(&se->load, weight);
-
        if (!se->on_rq) {
                /*
                 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
                 * we need to scale se->vlag when w_i changes.
                 */
-               se->vlag = div_s64(se->vlag * old_weight, weight);
+               se->vlag = div_s64(se->vlag * se->load.weight, weight);
        } else {
-               s64 deadline = se->deadline - se->vruntime;
-               /*
-                * When the weight changes, the virtual time slope changes and
-                * we should adjust the relative virtual deadline accordingly.
-                */
-               deadline = div_s64(deadline * old_weight, weight);
-               se->deadline = se->vruntime + deadline;
-               if (se != cfs_rq->curr)
-                       min_deadline_cb_propagate(&se->run_node, NULL);
+               reweight_eevdf(cfs_rq, se, weight);
        }
 
+       update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
        do {
                u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
        enqueue_load_avg(cfs_rq, se);
        if (se->on_rq) {
                update_load_add(&cfs_rq->load, se->load.weight);
-               if (cfs_rq->curr != se)
-                       avg_vruntime_add(cfs_rq, se);
+               if (!curr) {
+                       /*
+                        * The entity's vruntime has been adjusted, so let's check
+                        * whether the rq-wide min_vruntime needs updating too. Since
+                        * the calculations above require a stable min_vruntime rather
+                        * than an up-to-date one, we do the update at the end of the
+                        * reweight process.
+                        */
+                       __enqueue_entity(cfs_rq, se);
+                       update_min_vruntime(cfs_rq);
+               }
        }
 }
 
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
        shares = READ_ONCE(gcfs_rq->tg->shares);
-
-       if (likely(se->load.weight == shares))
-               return;
 #else
-       shares   = calc_group_shares(gcfs_rq);
+       shares = calc_group_shares(gcfs_rq);
 #endif
-
-       reweight_entity(cfs_rq_of(se), se, shares);
+       if (unlikely(se->load.weight != shares))
+               reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -11079,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
                        continue;
                }
 
-               /* Are we the first idle CPU? */
+               /*
+                * Are we the first idle core in a non-SMT domain or higher,
+                * or the first idle CPU in an SMT domain?
+                */
                return cpu == env->dst_cpu;
        }
 
-       if (idle_smt == env->dst_cpu)
-               return true;
+       /* Are we the first idle CPU with busy siblings? */
+       if (idle_smt != -1)
+               return idle_smt == env->dst_cpu;
 
        /* Are we the first CPU of this group ? */
        return group_balance_cpu(sg) == env->dst_cpu;
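
The reweight_eevdf() derivation above checks out with toy numbers. Take two
entities (w = 1, v = 4) and (w = 2, v = 13), so W = 3, V = (1*4 + 2*13)/3 = 10,
and the first entity's lag is (V - v)*w = 6. Reweighting it to w' = 2:

        v' = V - (V - v)*w/w' = 10 - 6*1/2 = 7
        V' = (2*7 + 2*13)/(3 - 1 + 2) = 40/4 = 10 = V     (corollary #2 holds)
        (V' - v')*w' = (10 - 7)*2 = 6 = (V - v)*w         (lag is preserved)
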
index 420d9cb..e219fcf 100644 (file)
@@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
        if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
                return -EINVAL;
 
+       /* PARISC cannot allow mdwe as it needs writable stacks */
+       if (IS_ENABLED(CONFIG_PARISC))
+               return -EINVAL;
+
        current_bits = get_current_mdwe();
        if (current_bits && current_bits != bits)
                return -EPERM; /* Cannot unset the flags */
index 238262e..7607939 100644 (file)
@@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
        }
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
        struct hrtimer_cpu_base *old_base, *new_base;
-       int i;
+       int i, ncpu = cpumask_first(cpu_active_mask);
 
-       BUG_ON(cpu_online(scpu));
-       tick_cancel_sched_timer(scpu);
+       tick_cancel_sched_timer(dying_cpu);
+
+       old_base = this_cpu_ptr(&hrtimer_bases);
+       new_base = &per_cpu(hrtimer_bases, ncpu);
 
-       /*
-        * this BH disable ensures that raise_softirq_irqoff() does
-        * not wakeup ksoftirqd (and acquire the pi-lock) while
-        * holding the cpu_base lock
-        */
-       local_bh_disable();
-       local_irq_disable();
-       old_base = &per_cpu(hrtimer_bases, scpu);
-       new_base = this_cpu_ptr(&hrtimer_bases);
        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, so deadlock is not possible.
         */
-       raw_spin_lock(&new_base->lock);
-       raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+       raw_spin_lock(&old_base->lock);
+       raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
         * The migration might have changed the first expiring softirq
         * timer on this CPU. Update it.
         */
-       hrtimer_update_softirq_timer(new_base, false);
+       __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+       /* Tell the other CPU to retrigger the next event */
+       smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
-       raw_spin_unlock(&old_base->lock);
        raw_spin_unlock(&new_base->lock);
+       raw_spin_unlock(&old_base->lock);
 
-       /* Check, if we got expired work to do */
-       __hrtimer_peek_ahead_timers();
-       local_irq_enable();
-       local_bh_enable();
        return 0;
 }
 
index a0d0609..8dcb8ca 100644 (file)
@@ -312,7 +312,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size
 
 typedef struct {
     short ncount[FSE_MAX_SYMBOL_VALUE + 1];
-    FSE_DTable dtable[1]; /* Dynamically sized */
+    FSE_DTable dtable[]; /* Dynamically sized */
 } FSE_DecompressWksp;
 
 
index 630077d..6262d55 100644 (file)
@@ -924,7 +924,7 @@ static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
                matched = true;
                break;
        default:
-               break;
+               return false;
        }
 
        return matched == filter->matching;
index 45bd0fd..be66723 100644 (file)
@@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void)
        struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions),
                        GFP_KERNEL);
 
+       if (!regions)
+               return NULL;
+
        regions->kobj = (struct kobject){};
        INIT_LIST_HEAD(&regions->regions_list);
        regions->nr_regions = 0;
@@ -1823,6 +1826,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
                return 0;
 
        region = damon_sysfs_scheme_region_alloc(r);
+       if (!region)
+               return 0;
        list_add_tail(&region->list, &sysfs_regions->regions_list);
        sysfs_regions->nr_regions++;
        if (kobject_init_and_add(&region->kobj,
index e278467..7472404 100644 (file)
@@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target,
                struct damon_ctx *ctx,
                struct damon_sysfs_target *sys_target)
 {
-       int err;
+       int err = 0;
 
        if (damon_target_has_pid(ctx)) {
                err = damon_sysfs_update_target_pid(target, sys_target->pid);
@@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
 
        damon_for_each_target_safe(t, next, ctx) {
                if (i < sysfs_targets->nr) {
-                       damon_sysfs_update_target(t, ctx,
+                       err = damon_sysfs_update_target(t, ctx,
                                        sysfs_targets->targets_arr[i]);
+                       if (err)
+                               return err;
                } else {
                        if (damon_target_has_pid(ctx))
                                put_pid(t->pid);
index 9710f43..32eedf3 100644 (file)
@@ -3443,7 +3443,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
                 * handled in the specific fault path, and it'll prohibit the
                 * fault-around logic.
                 */
-               if (!pte_none(vmf->pte[count]))
+               if (!pte_none(ptep_get(&vmf->pte[count])))
                        goto skip;
 
                count++;
index f31f024..4f54244 100644 (file)
@@ -2769,13 +2769,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                        int nr = folio_nr_pages(folio);
 
                        xas_split(&xas, folio, folio_order(folio));
-                       if (folio_test_swapbacked(folio)) {
-                               __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
-                                                       -nr);
-                       } else {
-                               __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
-                                                       -nr);
-                               filemap_nr_thps_dec(mapping);
+                       if (folio_test_pmd_mappable(folio)) {
+                               if (folio_test_swapbacked(folio)) {
+                                       __lruvec_stat_mod_folio(folio,
+                                                       NR_SHMEM_THPS, -nr);
+                               } else {
+                                       __lruvec_stat_mod_folio(folio,
+                                                       NR_FILE_THPS, -nr);
+                                       filemap_nr_thps_dec(mapping);
+                               }
                        }
                }
 
index 7efcc68..6a83100 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -468,7 +468,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
                        page = pfn_swap_entry_to_page(entry);
        }
        /* return 1 if the page is a normal ksm page or KSM-placed zero page */
-       ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte);
+       ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent);
        pte_unmap_unlock(pte, ptl);
        return ret;
 }
index 774bd6e..1c1061d 100644 (file)
@@ -2936,7 +2936,8 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
  * Moreover, it should not come from DMA buffer and is not readily
  * reclaimable. So those GFP bits should be masked off.
  */
-#define OBJCGS_CLEAR_MASK      (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
+#define OBJCGS_CLEAR_MASK      (__GFP_DMA | __GFP_RECLAIMABLE | \
+                                __GFP_ACCOUNT | __GFP_NOFAIL)
 
 /*
  * mod_objcg_mlstate() may be called with irq enabled, so
index 96d9eae..0b6ca55 100644 (file)
@@ -312,7 +312,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
 
        ret = -EEXIST;
        /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
-       if (!pte_none(*dst_pte))
+       if (!pte_none(ptep_get(dst_pte)))
                goto out_unlock;
 
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
index aa01f6e..744b4d7 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -414,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack)
 
 static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
 {
+#ifdef CONFIG_STACK_GROWSUP
+       /*
+        * For an upwards-growing stack the calculation is much simpler.
+        * Memory for the maximum stack size is reserved at the top of the
+        * task's address space. mmap_base starts directly below the stack
+        * and grows downwards.
+        */
+       return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
+#else
        unsigned long gap = rlim_stack->rlim_cur;
        unsigned long pad = stack_guard_gap;
 
@@ -431,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
                gap = MAX_GAP;
 
        return PAGE_ALIGN(STACK_TOP - gap - rnd);
+#endif
 }
 
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
index b5c406a..abb090f 100644 (file)
@@ -37,7 +37,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
        ktime_t tstamp = skb->tstamp;
        struct ip_frag_state state;
        struct iphdr *iph;
-       int err;
+       int err = 0;
 
        /* for offloaded checksums cleanup checksum before fragmentation */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
index 0d54843..af53f6d 100644 (file)
@@ -1119,7 +1119,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res)
        if (i == max_netdevices)
                return -ENFILE;
 
-       snprintf(res, IFNAMSIZ, name, i);
+       /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
+       strscpy(buf, name, IFNAMSIZ);
+       snprintf(res, IFNAMSIZ, buf, i);
        return i;
 }
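
The hazard being fixed is that the requested name may already live in res (an
in-place rename of "eth%d", say), in which case snprintf() would read its
format string out of the buffer it is writing to -- undefined behaviour. A
self-contained userspace illustration of the staged-copy fix (NAMESZ stands in
for IFNAMSIZ):

        #include <stdio.h>
        #include <string.h>

        #define NAMESZ 16       /* stand-in for IFNAMSIZ */

        /* name may alias res, so stage the "%d" pattern in a scratch
         * buffer before formatting into res (the kernel uses strscpy()). */
        static int format_name(char *res, const char *name, int i)
        {
                char buf[NAMESZ];

                strncpy(buf, name, NAMESZ - 1);
                buf[NAMESZ - 1] = '\0';
                return snprintf(res, NAMESZ, buf, i);
        }

        /* format_name(dev, dev, 3) safely turns "eth%d" into "eth3" */
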
 
index ceb684b..4c2e77b 100644 (file)
@@ -180,18 +180,17 @@ static void gso_test_func(struct kunit *test)
        }
 
        if (tcase->frag_skbs) {
-               unsigned int total_size = 0, total_true_size = 0, alloc_size = 0;
+               unsigned int total_size = 0, total_true_size = 0;
                struct sk_buff *frag_skb, *prev = NULL;
 
-               page = alloc_page(GFP_KERNEL);
-               KUNIT_ASSERT_NOT_NULL(test, page);
-               page_ref_add(page, tcase->nr_frag_skbs - 1);
-
                for (i = 0; i < tcase->nr_frag_skbs; i++) {
                        unsigned int frag_size;
 
+                       page = alloc_page(GFP_KERNEL);
+                       KUNIT_ASSERT_NOT_NULL(test, page);
+
                        frag_size = tcase->frag_skbs[i];
-                       frag_skb = build_skb(page_address(page) + alloc_size,
+                       frag_skb = build_skb(page_address(page),
                                             frag_size + shinfo_size);
                        KUNIT_ASSERT_NOT_NULL(test, frag_skb);
                        __skb_put(frag_skb, frag_size);
@@ -204,11 +203,8 @@ static void gso_test_func(struct kunit *test)
 
                        total_size += frag_size;
                        total_true_size += frag_skb->truesize;
-                       alloc_size += frag_size + shinfo_size;
                }
 
-               KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE);
-
                skb->len += total_size;
                skb->data_len += total_size;
                skb->truesize += total_true_size;
index 598c1b1..a532f74 100644 (file)
@@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk)
                if (err)
                        goto unlock;
        }
+       sock_set_flag(sk, SOCK_RCU_FREE);
        if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
                sk->sk_family == AF_INET6)
                __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
        else
                __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
-       sock_set_flag(sk, SOCK_RCU_FREE);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
        spin_unlock(&ilb2->lock);
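
Moving sock_set_flag() above the list insertion follows the general rule for
lockless readers: finish initializing an object, lifetime flags included,
before the store that makes it reachable, or a concurrent lookup can act on a
half-initialized socket. The pattern in self-contained C11 form:

        #include <stdatomic.h>

        struct node { int flags; struct node *next; };

        /* Readers walk the list without locks, so every field they may
         * consult must be written before the publishing store. */
        static void publish(struct node *_Atomic *head, struct node *n,
                            int flags)
        {
                n->flags = flags;       /* initialize first */
                n->next = atomic_load_explicit(head, memory_order_relaxed);
                atomic_store_explicit(head, n, memory_order_release);
        }
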
index 1529ec3..bf4d96f 100644 (file)
@@ -1515,8 +1515,9 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
        struct mptcp_pm_addr_entry *entry;
 
        list_for_each_entry(entry, rm_list, list) {
-               remove_anno_list_by_saddr(msk, &entry->addr);
-               if (alist.nr < MPTCP_RM_IDS_MAX)
+               if ((remove_anno_list_by_saddr(msk, &entry->addr) ||
+                    lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) &&
+                   alist.nr < MPTCP_RM_IDS_MAX)
                        alist.ids[alist.nr++] = entry->addr.id;
        }
 
index a0b8356..bc81ea5 100644 (file)
@@ -1230,6 +1230,8 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
        mptcp_do_fallback(ssk);
 }
 
+#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
+
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                              struct mptcp_data_frag *dfrag,
                              struct mptcp_sendmsg_info *info)
@@ -1256,6 +1258,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                return -EAGAIN;
 
        /* compute send limit */
+       if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE))
+               ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE;
        info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
        copy = info->size_goal;
 
@@ -3398,10 +3402,11 @@ static void mptcp_release_cb(struct sock *sk)
        if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
                __mptcp_clean_una_wakeup(sk);
        if (unlikely(msk->cb_flags)) {
-               /* be sure to set the current sk state before tacking actions
-                * depending on sk_state, that is processing MPTCP_ERROR_REPORT
+               /* be sure to set the current sk state before taking actions
+                * depending on sk_state (MPTCP_ERROR_REPORT).
+                * On sk release, avoid actions depending on the first subflow.
                 */
-               if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags))
+               if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags) && msk->first)
                        __mptcp_set_connected(sk);
                if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
                        __mptcp_error_report(sk);
index 77f5e89..3536807 100644 (file)
@@ -738,8 +738,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
        val = READ_ONCE(inet_sk(sk)->tos);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+               bool slow;
 
+               slow = lock_sock_fast(ssk);
                __ip_sock_set_tos(ssk, val);
+               unlock_sock_fast(ssk, slow);
        }
        release_sock(sk);
 
index f8854bf..62fb103 100644 (file)
@@ -89,11 +89,6 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
        if ((had_link == has_link) || chained)
                return 0;
 
-       if (had_link)
-               netif_carrier_off(ndp->ndev.dev);
-       else
-               netif_carrier_on(ndp->ndev.dev);
-
        if (!ndp->multi_package && !nc->package->multi_channel) {
                if (had_link) {
                        ndp->flags |= NCSI_DEV_RESHUFFLE;
index 35d2f9c..4c133e0 100644 (file)
@@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
        ip_set_dereference((inst)->ip_set_list)[id]
 #define ip_set_ref_netlink(inst,id)    \
        rcu_dereference_raw((inst)->ip_set_list)[id]
+#define ip_set_dereference_nfnl(p)     \
+       rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
 
 /* The set types are implemented in modules and registered set types
  * can be found in ip_set_type_list. Adding/deleting types is
@@ -708,15 +710,10 @@ __ip_set_put_netlink(struct ip_set *set)
 static struct ip_set *
 ip_set_rcu_get(struct net *net, ip_set_id_t index)
 {
-       struct ip_set *set;
        struct ip_set_net *inst = ip_set_pernet(net);
 
-       rcu_read_lock();
-       /* ip_set_list itself needs to be protected */
-       set = rcu_dereference(inst->ip_set_list)[index];
-       rcu_read_unlock();
-
-       return set;
+       /* ip_set_list and the set pointer need to be protected */
+       return ip_set_dereference_nfnl(inst->ip_set_list)[index];
 }
 
 static inline void
@@ -1397,6 +1394,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
        ip_set(inst, to_id) = from;
        write_unlock_bh(&ip_set_ref_lock);
 
+       /* Make sure all readers of the old set pointers are completed. */
+       synchronize_rcu();
+
        return 0;
 }
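
The added synchronize_rcu() completes the standard RCU update recipe: swap the
published pointers, then wait out every reader that might still see the old
mapping before anything acts on the new one. A userspace sketch with liburcu
(the writer-side lock is assumed to be held by the caller):

        #include <urcu.h>

        struct ip_set;
        static struct ip_set *sets[2];

        static void swap_sets(void)
        {
                struct ip_set *a = sets[0], *b = sets[1];

                rcu_assign_pointer(sets[0], b);
                rcu_assign_pointer(sets[1], a);
                synchronize_rcu();      /* no reader still holds the old view */
        }
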
 
index a761ee6..c0a4298 100644 (file)
@@ -7263,10 +7263,11 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
 
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, attr);
-                       break;
+                       return err;
                }
        }
-       return err;
+
+       return 0;
 }
 
 /*
@@ -9679,16 +9680,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
        call_rcu(&trans->rcu, nft_trans_gc_trans_free);
 }
 
-static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
-                                                 unsigned int gc_seq,
-                                                 bool sync)
+struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
+                                                unsigned int gc_seq)
 {
-       struct nft_set_elem_catchall *catchall, *next;
+       struct nft_set_elem_catchall *catchall;
        const struct nft_set *set = gc->set;
-       struct nft_elem_priv *elem_priv;
        struct nft_set_ext *ext;
 
-       list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+       list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
 
                if (!nft_set_elem_expired(ext))
@@ -9698,35 +9697,42 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
 
                nft_set_elem_dead(ext);
 dead_elem:
-               if (sync)
-                       gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-               else
-                       gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
-
+               gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
                if (!gc)
                        return NULL;
 
-               elem_priv = catchall->elem;
-               if (sync) {
-                       nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
-                       nft_setelem_catchall_destroy(catchall);
-               }
-
-               nft_trans_gc_elem_add(gc, elem_priv);
+               nft_trans_gc_elem_add(gc, catchall->elem);
        }
 
        return gc;
 }
 
-struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
-                                                unsigned int gc_seq)
-{
-       return nft_trans_gc_catchall(gc, gc_seq, false);
-}
-
 struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
 {
-       return nft_trans_gc_catchall(gc, 0, true);
+       struct nft_set_elem_catchall *catchall, *next;
+       const struct nft_set *set = gc->set;
+       struct nft_elem_priv *elem_priv;
+       struct nft_set_ext *ext;
+
+       WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));
+
+       list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+               ext = nft_set_elem_ext(set, catchall->elem);
+
+               if (!nft_set_elem_expired(ext))
+                       continue;
+
+               gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+               if (!gc)
+                       return NULL;
+
+               elem_priv = catchall->elem;
+               nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
+               nft_setelem_catchall_destroy(catchall);
+               nft_trans_gc_elem_add(gc, elem_priv);
+       }
+
+       return gc;
 }
 
 static void nf_tables_module_autoload_cleanup(struct net *net)
index e596d1a..f6e791a 100644 (file)
@@ -38,13 +38,14 @@ void nft_byteorder_eval(const struct nft_expr *expr,
 
        switch (priv->size) {
        case 8: {
+               u64 *dst64 = (void *)dst;
                u64 src64;
 
                switch (priv->op) {
                case NFT_BYTEORDER_NTOH:
                        for (i = 0; i < priv->len / 8; i++) {
                                src64 = nft_reg_load64(&src[i]);
-                               nft_reg_store64(&dst[i],
+                               nft_reg_store64(&dst64[i],
                                                be64_to_cpu((__force __be64)src64));
                        }
                        break;
@@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr,
                        for (i = 0; i < priv->len / 8; i++) {
                                src64 = (__force __u64)
                                        cpu_to_be64(nft_reg_load64(&src[i]));
-                               nft_reg_store64(&dst[i], src64);
+                               nft_reg_store64(&dst64[i], src64);
                        }
                        break;
                }
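
The casts are safe because the register file is a flat array of 32-bit words:
a 64-bit result simply occupies two adjacent slots, and going through a
properly typed pointer (or memcpy) keeps the access in bounds and alias-clean,
which the old store through a single u32 slot was not. In isolation:

        #include <stdint.h>
        #include <string.h>

        /* A 64-bit value spans two consecutive 32-bit register slots. */
        static void reg_store64(uint32_t *reg, uint64_t val)
        {
                memcpy(reg, &val, sizeof(val)); /* writes reg[0] and reg[1] */
        }

        static uint64_t reg_load64(const uint32_t *reg)
        {
                uint64_t val;

                memcpy(&val, reg, sizeof(val));
                return val;
        }
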
index f7da7c4..ba0d368 100644 (file)
@@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key,
 {
        switch (key) {
        case NFT_META_TIME_NS:
-               nft_reg_store64(dest, ktime_get_real_ns());
+               nft_reg_store64((u64 *)dest, ktime_get_real_ns());
                break;
        case NFT_META_TIME_DAY:
                nft_reg_store8(dest, nft_meta_weekday());
index 6f1186a..baa3fea 100644 (file)
@@ -624,14 +624,12 @@ static void nft_rbtree_gc(struct nft_set *set)
 {
        struct nft_rbtree *priv = nft_set_priv(set);
        struct nft_rbtree_elem *rbe, *rbe_end = NULL;
-       struct nftables_pernet *nft_net;
        struct rb_node *node, *next;
        struct nft_trans_gc *gc;
        struct net *net;
 
        set  = nft_set_container_of(priv);
        net  = read_pnet(&set->net);
-       nft_net = nft_pernet(net);
 
        gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
        if (!gc)
index 0db0ecf..b3f4a50 100644 (file)
@@ -1549,6 +1549,9 @@ static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
        if (bind) {
                struct flow_action_entry *entry = entry_data;
 
+               if (tcf_ct_helper(act))
+                       return -EOPNOTSUPP;
+
                entry->id = FLOW_ACTION_CT;
                entry->ct.action = tcf_ct_action(act);
                entry->ct.zone = tcf_ct_zone(act);
index 5bc076f..c763008 100644 (file)
@@ -102,6 +102,7 @@ static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len)
                return -EMSGSIZE;
 
        skb_put(skb, TLV_SPACE(len));
+       memset(tlv, 0, TLV_SPACE(len));
        tlv->tlv_type = htons(type);
        tlv->tlv_len = htons(TLV_LENGTH(len));
        if (len && data)
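
The memset() plugs an information leak: TLV_SPACE() rounds the payload up for
alignment, and without the clear those padding bytes would carry stale skb
memory out to userspace via netlink. The same discipline in a self-contained
form (the header layout here is illustrative, not the real TIPC TLV struct):

        #include <string.h>

        struct tlv { unsigned short type, len; unsigned char data[]; };

        /* @space is the aligned size reserved for the whole TLV. */
        static void tlv_fill(struct tlv *t, size_t space, unsigned short type,
                             const void *data, unsigned short len)
        {
                memset(t, 0, space);    /* header, payload and padding */
                t->type = type;
                t->len = len;
                if (len && data)
                        memcpy(t->data, data, len);
        }
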
index 45506a9..a357dc5 100644 (file)
@@ -2581,15 +2581,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 
        if (!(state->flags & MSG_PEEK))
                WRITE_ONCE(u->oob_skb, NULL);
-
+       else
+               skb_get(oob_skb);
        unix_state_unlock(sk);
 
        chunk = state->recv_actor(oob_skb, 0, chunk, state);
 
-       if (!(state->flags & MSG_PEEK)) {
+       if (!(state->flags & MSG_PEEK))
                UNIXCB(oob_skb).consumed += 1;
-               kfree_skb(oob_skb);
-       }
+
+       consume_skb(oob_skb);
 
        mutex_unlock(&u->iolock);
 
index 39e86be..ff0b192 100644 (file)
@@ -17,7 +17,7 @@
  *     if (argc <= 1)
  *             printf("%s: no command arguments :(\n", *argv);
  *     else
- *             printf("%s: %d command arguments!\n", *argv, args - 1);
+ *             printf("%s: %d command arguments!\n", *argv, argc - 1);
  * }
  *
  * after:
@@ -47,7 +47,7 @@
  *             // perturb_local_entropy()
  *     } else {
  *             local_entropy ^= 3896280633962944730;
- *             printf("%s: %d command arguments!\n", *argv, args - 1);
+ *             printf("%s: %d command arguments!\n", *argv, argc - 1);
  *     }
  *
  *     // latent_entropy_execute() 4.
index 366395c..910bd21 100644 (file)
@@ -278,8 +278,6 @@ static bool is_flexible_array(const_tree field)
 {
        const_tree fieldtype;
        const_tree typesize;
-       const_tree elemtype;
-       const_tree elemsize;
 
        fieldtype = TREE_TYPE(field);
        typesize = TYPE_SIZE(fieldtype);
@@ -287,20 +285,12 @@ static bool is_flexible_array(const_tree field)
        if (TREE_CODE(fieldtype) != ARRAY_TYPE)
                return false;
 
-       elemtype = TREE_TYPE(fieldtype);
-       elemsize = TYPE_SIZE(elemtype);
-
        /* size of type is represented in bits */
 
        if (typesize == NULL_TREE && TYPE_DOMAIN(fieldtype) != NULL_TREE &&
            TYPE_MAX_VALUE(TYPE_DOMAIN(fieldtype)) == NULL_TREE)
                return true;
 
-       if (typesize != NULL_TREE &&
-           (TREE_CONSTANT(typesize) && (!tree_to_uhwi(typesize) ||
-            tree_to_uhwi(typesize) == tree_to_uhwi(elemsize))))
-               return true;
-
        return false;
 }
 
index 757a4d1..a9ef6d8 100644 (file)
@@ -21,6 +21,10 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt)
                return -ENOMEM;
 
        cs35l56->base.dev = &clt->dev;
+
+#ifdef CS35L56_WAKE_HOLD_TIME_US
+       cs35l56->base.can_hibernate = true;
+#endif
        cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c);
        if (IS_ERR(cs35l56->base.regmap)) {
                ret = PTR_ERR(cs35l56->base.regmap);
index 0326491..db90feb 100644 (file)
@@ -2135,6 +2135,9 @@ static int azx_probe(struct pci_dev *pci,
        if (chip->driver_caps & AZX_DCAPS_I915_COMPONENT) {
                err = snd_hdac_i915_init(azx_bus(chip));
                if (err < 0) {
+                       if (err == -EPROBE_DEFER)
+                               goto out_free;
+
                        /* if the controller is bound only with HDMI/DP
                         * (for HSW and BDW), we need to abort the probe;
                         * for other chips, still continue probing as other
index 669ae3d..5618b1d 100644 (file)
@@ -9832,6 +9832,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST),
        SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8902, "HP OMEN 16", ALC285_FIXUP_HP_MUTE_LED),
+       SND_PCI_QUIRK(0x103c, 0x890e, "HP 255 G8 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2),
        SND_PCI_QUIRK(0x103c, 0x8919, "HP Pavilion Aero Laptop 13-be0xxx", ALC287_FIXUP_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x896d, "HP ZBook Firefly 16 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
@@ -9867,6 +9868,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x8b2f, "HP 255 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2),
        SND_PCI_QUIRK(0x103c, 0x8b42, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
@@ -9900,12 +9902,16 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
        SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
+       SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED),
        SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC),
        SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300),
        SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
        SND_PCI_QUIRK(0x1043, 0x10a1, "ASUS UX391UA", ALC294_FIXUP_ASUS_SPK),
        SND_PCI_QUIRK(0x1043, 0x10c0, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC),
        SND_PCI_QUIRK(0x1043, 0x10d0, "ASUS X540LA/X540LJ", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE),
+       SND_PCI_QUIRK(0x1043, 0x10d3, "ASUS K6500ZC", ALC294_FIXUP_ASUS_SPK),
        SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
        SND_PCI_QUIRK(0x1043, 0x11c0, "ASUS X556UR", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1043, 0x125e, "ASUS Q524UQK", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE),
@@ -9944,13 +9950,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW),
        SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC),
+       SND_PCI_QUIRK(0x1043, 0x1a63, "ASUS UX3405MA", ALC245_FIXUP_CS35L41_SPI_2),
        SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2),
        SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2),
        SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B),
        SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC),
        SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2),
        SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE),
+       SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2),
        SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
+       SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2),
+       SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2),
        SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401),
        SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS),
        SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JI", ALC285_FIXUP_ASUS_HEADSET_MIC),
@@ -10821,22 +10831,6 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
                {0x12, 0x90a60130},
                {0x17, 0x90170110},
                {0x21, 0x03211020}),
-       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
-               {0x14, 0x90170110},
-               {0x21, 0x04211020}),
-       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
-               {0x14, 0x90170110},
-               {0x21, 0x04211030}),
-       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE,
-               ALC295_STANDARD_PINS,
-               {0x17, 0x21014020},
-               {0x18, 0x21a19030}),
-       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE,
-               ALC295_STANDARD_PINS,
-               {0x17, 0x21014040},
-               {0x18, 0x21a19050}),
-       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE,
-               ALC295_STANDARD_PINS),
        SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE,
                ALC298_STANDARD_PINS,
                {0x17, 0x90170110}),
@@ -10880,6 +10874,9 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = {
        SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
                {0x19, 0x40000000},
                {0x1b, 0x40000000}),
+       SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
+               {0x19, 0x40000000},
+               {0x1b, 0x40000000}),
        SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
                {0x19, 0x40000000},
                {0x1a, 0x40000000}),
index 9a10512..7a33437 100644 (file)
@@ -211,9 +211,6 @@ int *fd_instr_count_percpu;
 struct timeval interval_tv = { 5, 0 };
 struct timespec interval_ts = { 5, 0 };
 
-/* Save original CPU model */
-unsigned int model_orig;
-
 unsigned int num_iterations;
 unsigned int header_iterations;
 unsigned int debug;
@@ -224,24 +221,16 @@ unsigned int rapl_joules;
 unsigned int summary_only;
 unsigned int list_header_only;
 unsigned int dump_only;
-unsigned int do_snb_cstates;
-unsigned int do_knl_cstates;
-unsigned int do_slm_cstates;
-unsigned int use_c1_residency_msr;
 unsigned int has_aperf;
 unsigned int has_epb;
 unsigned int has_turbo;
 unsigned int is_hybrid;
-unsigned int do_irtl_snb;
-unsigned int do_irtl_hsw;
 unsigned int units = 1000000;  /* MHz etc */
 unsigned int genuine_intel;
 unsigned int authentic_amd;
 unsigned int hygon_genuine;
 unsigned int max_level, max_extended_level;
 unsigned int has_invariant_tsc;
-unsigned int do_nhm_platform_info;
-unsigned int no_MSR_MISC_PWR_MGMT;
 unsigned int aperf_mperf_multiplier = 1;
 double bclk;
 double base_hz;
@@ -250,7 +239,6 @@ double tsc_tweak = 1.0;
 unsigned int show_pkg_only;
 unsigned int show_core_only;
 char *output_buffer, *outp;
-unsigned int do_rapl;
 unsigned int do_dts;
 unsigned int do_ptm;
 unsigned int do_ipc;
@@ -261,65 +249,686 @@ unsigned int gfx_cur_mhz;
 unsigned int gfx_act_mhz;
 unsigned int tj_max;
 unsigned int tj_max_override;
-int tcc_offset_bits;
 double rapl_power_units, rapl_time_units;
 double rapl_dram_energy_units, rapl_energy_units;
 double rapl_joule_counter_range;
-unsigned int do_core_perf_limit_reasons;
-unsigned int has_automatic_cstate_conversion;
-unsigned int dis_cstate_prewake;
-unsigned int do_gfx_perf_limit_reasons;
-unsigned int do_ring_perf_limit_reasons;
 unsigned int crystal_hz;
 unsigned long long tsc_hz;
 int base_cpu;
-double discover_bclk(unsigned int family, unsigned int model);
 unsigned int has_hwp;          /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
                        /* IA32_HWP_REQUEST, IA32_HWP_STATUS */
 unsigned int has_hwp_notify;   /* IA32_HWP_INTERRUPT */
 unsigned int has_hwp_activity_window;  /* IA32_HWP_REQUEST[bits 41:32] */
 unsigned int has_hwp_epp;      /* IA32_HWP_REQUEST[bits 31:24] */
 unsigned int has_hwp_pkg;      /* IA32_HWP_REQUEST_PKG */
-unsigned int has_misc_feature_control;
 unsigned int first_counter_read = 1;
 int ignore_stdin;
 
-#define RAPL_PKG               (1 << 0)
-                                       /* 0x610 MSR_PKG_POWER_LIMIT */
-                                       /* 0x611 MSR_PKG_ENERGY_STATUS */
-#define RAPL_PKG_PERF_STATUS   (1 << 1)
-                                       /* 0x613 MSR_PKG_PERF_STATUS */
-#define RAPL_PKG_POWER_INFO    (1 << 2)
-                                       /* 0x614 MSR_PKG_POWER_INFO */
-
-#define RAPL_DRAM              (1 << 3)
-                                       /* 0x618 MSR_DRAM_POWER_LIMIT */
-                                       /* 0x619 MSR_DRAM_ENERGY_STATUS */
-#define RAPL_DRAM_PERF_STATUS  (1 << 4)
-                                       /* 0x61b MSR_DRAM_PERF_STATUS */
-#define RAPL_DRAM_POWER_INFO   (1 << 5)
-                                       /* 0x61c MSR_DRAM_POWER_INFO */
-
-#define RAPL_CORES_POWER_LIMIT (1 << 6)
-                                       /* 0x638 MSR_PP0_POWER_LIMIT */
-#define RAPL_CORE_POLICY       (1 << 7)
-                                       /* 0x63a MSR_PP0_POLICY */
-
-#define RAPL_GFX               (1 << 8)
-                                       /* 0x640 MSR_PP1_POWER_LIMIT */
-                                       /* 0x641 MSR_PP1_ENERGY_STATUS */
-                                       /* 0x642 MSR_PP1_POLICY */
-
-#define RAPL_CORES_ENERGY_STATUS       (1 << 9)
-                                       /* 0x639 MSR_PP0_ENERGY_STATUS */
-#define RAPL_PER_CORE_ENERGY   (1 << 10)
-                                       /* Indicates cores energy collection is per-core,
-                                        * not per-package. */
-#define RAPL_AMD_F17H          (1 << 11)
-                                       /* 0xc0010299 MSR_RAPL_PWR_UNIT */
-                                       /* 0xc001029a MSR_CORE_ENERGY_STAT */
-                                       /* 0xc001029b MSR_PKG_ENERGY_STAT */
-#define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT)
+int get_msr(int cpu, off_t offset, unsigned long long *msr);
+
+/* Model specific support Start */
+
+/* List of features that may diverge among different platforms */
+struct platform_features {
+       bool has_msr_misc_feature_control;      /* MSR_MISC_FEATURE_CONTROL */
+       bool has_msr_misc_pwr_mgmt;     /* MSR_MISC_PWR_MGMT */
+       bool has_nhm_msrs;      /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */
+       bool has_config_tdp;    /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */
+       int bclk_freq;          /* CPU base clock */
+       int crystal_freq;       /* Crystal clock to use when not available from CPUID.15 */
+       int supported_cstates;  /* Core cstates and Package cstates supported */
+       int cst_limit;          /* MSR_PKG_CST_CONFIG_CONTROL */
+       bool has_cst_auto_convension;   /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */
+       bool has_irtl_msrs;     /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */
+       bool has_msr_core_c1_res;       /* MSR_CORE_C1_RES */
+       bool has_msr_module_c6_res_ms;  /* MSR_MODULE_C6_RES_MS */
+       bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */
+       bool has_msr_atom_pkg_c6_residency;     /* MSR_ATOM_PKG_C6_RESIDENCY */
+       bool has_msr_knl_core_c6_residency;     /* MSR_KNL_CORE_C6_RESIDENCY */
+       bool has_ext_cst_msrs;  /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */
+       bool has_cst_prewake_bit;       /* Cstate prewake bit in MSR_IA32_POWER_CTL */
+       int trl_msrs;           /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */
+       int plr_msrs;           /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */
+       int rapl_msrs;          /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */
+       bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */
+       bool has_rapl_divisor;  /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */
+       bool has_fixed_rapl_unit;       /* Fixed Energy Unit used for DRAM RAPL Domain */
+       int rapl_quirk_tdp;     /* Hardcoded TDP value when it cannot be retrieved from hardware */
+       int tcc_offset_bits;    /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */
+       bool enable_tsc_tweak;  /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */
+       bool need_perf_multiplier;      /* mperf/aperf multiplier */
+};
+
+struct platform_data {
+       unsigned int model;
+       const struct platform_features *features;
+};
+
+/* For BCLK */
+enum bclk_freq {
+       BCLK_100MHZ = 1,
+       BCLK_133MHZ,
+       BCLK_SLV,
+};
+
+#define SLM_BCLK_FREQS 5
+double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
+
+double slm_bclk(void)
+{
+       unsigned long long msr = 3;
+       unsigned int i;
+       double freq;
+
+       if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
+               fprintf(outf, "SLM BCLK: unknown\n");
+
+       i = msr & 0xf;
+       if (i >= SLM_BCLK_FREQS) {
+               fprintf(outf, "SLM BCLK[%d] invalid\n", i);
+               i = 3;
+       }
+       freq = slm_freq_table[i];
+
+       if (!quiet)
+               fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
+
+       return freq;
+}
+
+/* For Package cstate limit */
+enum package_cstate_limit {
+       CST_LIMIT_NHM = 1,
+       CST_LIMIT_SNB,
+       CST_LIMIT_HSW,
+       CST_LIMIT_SKX,
+       CST_LIMIT_ICX,
+       CST_LIMIT_SLV,
+       CST_LIMIT_AMT,
+       CST_LIMIT_KNL,
+       CST_LIMIT_GMT,
+};
+
+/* For Turbo Ratio Limit MSRs */
+enum turbo_ratio_limit_msrs {
+       TRL_BASE = BIT(0),
+       TRL_LIMIT1 = BIT(1),
+       TRL_LIMIT2 = BIT(2),
+       TRL_ATOM = BIT(3),
+       TRL_KNL = BIT(4),
+       TRL_CORECOUNT = BIT(5),
+};
+
+/* For Perf Limit Reason MSRs */
+enum perf_limit_reason_msrs {
+       PLR_CORE = BIT(0),
+       PLR_GFX = BIT(1),
+       PLR_RING = BIT(2),
+};
+
+/* For RAPL MSRs */
+enum rapl_msrs {
+       RAPL_PKG_POWER_LIMIT = BIT(0),  /* 0x610 MSR_PKG_POWER_LIMIT */
+       RAPL_PKG_ENERGY_STATUS = BIT(1),        /* 0x611 MSR_PKG_ENERGY_STATUS */
+       RAPL_PKG_PERF_STATUS = BIT(2),  /* 0x613 MSR_PKG_PERF_STATUS */
+       RAPL_PKG_POWER_INFO = BIT(3),   /* 0x614 MSR_PKG_POWER_INFO */
+       RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */
+       RAPL_DRAM_ENERGY_STATUS = BIT(5),       /* 0x619 MSR_DRAM_ENERGY_STATUS */
+       RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */
+       RAPL_DRAM_POWER_INFO = BIT(7),  /* 0x61c MSR_DRAM_POWER_INFO */
+       RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */
+       RAPL_CORE_ENERGY_STATUS = BIT(9),       /* 0x639 MSR_PP0_ENERGY_STATUS */
+       RAPL_CORE_POLICY = BIT(10),     /* 0x63a MSR_PP0_POLICY */
+       RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */
+       RAPL_GFX_ENERGY_STATUS = BIT(12),       /* 0x641 MSR_PP1_ENERGY_STATUS */
+       RAPL_GFX_POLICY = BIT(13),      /* 0x642 MSR_PP1_POLICY */
+       RAPL_AMD_PWR_UNIT = BIT(14),    /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */
+       RAPL_AMD_CORE_ENERGY_STAT = BIT(15),    /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */
+       RAPL_AMD_PKG_ENERGY_STAT = BIT(16),     /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */
+};
+
+#define RAPL_PKG       (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT)
+#define RAPL_DRAM      (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT)
+#define RAPL_CORE      (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT)
+#define RAPL_GFX       (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS)
+
+#define RAPL_PKG_ALL   (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
+#define RAPL_DRAM_ALL  (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
+#define RAPL_CORE_ALL  (RAPL_CORE | RAPL_CORE_POLICY)
+#define RAPL_GFX_ALL   (RAPL_GFX | RAPL_GFX_POLICY)
+
+#define RAPL_AMD_F17H  (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
+
+/* For Cstates */
+enum cstates {
+       CC1 = BIT(0),
+       CC3 = BIT(1),
+       CC6 = BIT(2),
+       CC7 = BIT(3),
+       PC2 = BIT(4),
+       PC3 = BIT(5),
+       PC6 = BIT(6),
+       PC7 = BIT(7),
+       PC8 = BIT(8),
+       PC9 = BIT(9),
+       PC10 = BIT(10),
+};
+
+static const struct platform_features nhm_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_133MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_NHM,
+       .trl_msrs = TRL_BASE,
+};
+
+static const struct platform_features nhx_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_133MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_NHM,
+};
+
+static const struct platform_features snb_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_SNB,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features snx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_SNB,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
+};
+
+static const struct platform_features ivb_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_SNB,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features ivx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_SNB,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE | TRL_LIMIT1,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
+};
+
+static const struct platform_features hsw_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features hsx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2,
+       .plr_msrs = PLR_CORE | PLR_RING,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+       .has_fixed_rapl_unit = 1,
+};
+
+static const struct platform_features hswl_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features hswg_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features bdw_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features bdwg_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features bdx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .has_cst_auto_convension = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+       .has_fixed_rapl_unit = 1,
+};
+
+static const struct platform_features skl_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .crystal_freq = 24000000,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .has_ext_cst_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .tcc_offset_bits = 6,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
+       .enable_tsc_tweak = 1,
+};
+
+static const struct platform_features cnl_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .has_msr_core_c1_res = 1,
+       .has_ext_cst_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .tcc_offset_bits = 6,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
+       .enable_tsc_tweak = 1,
+};
+
+static const struct platform_features adl_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10,
+       .cst_limit = CST_LIMIT_HSW,
+       .has_irtl_msrs = 1,
+       .has_msr_core_c1_res = 1,
+       .has_ext_cst_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .tcc_offset_bits = 6,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
+       .enable_tsc_tweak = 1,
+};
+
+static const struct platform_features skx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | PC2 | PC6,
+       .cst_limit = CST_LIMIT_SKX,
+       .has_irtl_msrs = 1,
+       .has_cst_auto_convension = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+       .has_fixed_rapl_unit = 1,
+};
+
+static const struct platform_features icx_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | PC2 | PC6,
+       .cst_limit = CST_LIMIT_ICX,
+       .has_irtl_msrs = 1,
+       .has_cst_prewake_bit = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+       .has_fixed_rapl_unit = 1,
+};
+
+static const struct platform_features spr_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | PC2 | PC6,
+       .cst_limit = CST_LIMIT_SKX,
+       .has_msr_core_c1_res = 1,
+       .has_irtl_msrs = 1,
+       .has_cst_prewake_bit = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+};
+
+static const struct platform_features srf_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | PC2 | PC6,
+       .cst_limit = CST_LIMIT_SKX,
+       .has_msr_core_c1_res = 1,
+       .has_msr_module_c6_res_ms = 1,
+       .has_irtl_msrs = 1,
+       .has_cst_prewake_bit = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+};
+
+static const struct platform_features grr_features = {
+       .has_msr_misc_feature_control = 1,
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6,
+       .cst_limit = CST_LIMIT_SKX,
+       .has_msr_core_c1_res = 1,
+       .has_msr_module_c6_res_ms = 1,
+       .has_irtl_msrs = 1,
+       .has_cst_prewake_bit = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+};
+
+static const struct platform_features slv_features = {
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_SLV,
+       .supported_cstates = CC1 | CC6 | PC6,
+       .cst_limit = CST_LIMIT_SLV,
+       .has_msr_core_c1_res = 1,
+       .has_msr_module_c6_res_ms = 1,
+       .has_msr_c6_demotion_policy_config = 1,
+       .has_msr_atom_pkg_c6_residency = 1,
+       .trl_msrs = TRL_ATOM,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE,
+       .has_rapl_divisor = 1,
+       .rapl_quirk_tdp = 30,
+};
+
+static const struct platform_features slvd_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_SLV,
+       .supported_cstates = CC1 | CC6 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_SLV,
+       .has_msr_atom_pkg_c6_residency = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_CORE,
+       .rapl_quirk_tdp = 30,
+};
+
+static const struct platform_features amt_features = {
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_133MHZ,
+       .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_AMT,
+       .trl_msrs = TRL_BASE,
+};
+
+static const struct platform_features gmt_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .crystal_freq = 19200000,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_GMT,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features gmtd_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .crystal_freq = 25000000,
+       .supported_cstates = CC1 | CC6 | PC2 | PC6,
+       .cst_limit = CST_LIMIT_GMT,
+       .has_irtl_msrs = 1,
+       .has_msr_core_c1_res = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS,
+};
+
+static const struct platform_features gmtp_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .crystal_freq = 19200000,
+       .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_GMT,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
+};
+
+static const struct platform_features tmt_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
+       .cst_limit = CST_LIMIT_GMT,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
+       .enable_tsc_tweak = 1,
+};
+
+static const struct platform_features tmtd_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6,
+       .cst_limit = CST_LIMIT_GMT,
+       .has_irtl_msrs = 1,
+       .trl_msrs = TRL_BASE | TRL_CORECOUNT,
+       .rapl_msrs = RAPL_PKG_ALL,
+};
+
+static const struct platform_features knl_features = {
+       .has_msr_misc_pwr_mgmt = 1,
+       .has_nhm_msrs = 1,
+       .has_config_tdp = 1,
+       .bclk_freq = BCLK_100MHZ,
+       .supported_cstates = CC1 | CC6 | PC3 | PC6,
+       .cst_limit = CST_LIMIT_KNL,
+       .has_msr_knl_core_c6_residency = 1,
+       .trl_msrs = TRL_KNL,
+       .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
+       .has_fixed_rapl_unit = 1,
+       .need_perf_multiplier = 1,
+};
+
+static const struct platform_features default_features = {
+};
+
+static const struct platform_features amd_features_with_rapl = {
+       .rapl_msrs = RAPL_AMD_F17H,
+       .has_per_core_rapl = 1,
+       .rapl_quirk_tdp = 280,  /* This is the max stock TDP of HEDT/Server Fam17h+ chips */
+};
+
+static const struct platform_data turbostat_pdata[] = {
+       { INTEL_FAM6_NEHALEM, &nhm_features },
+       { INTEL_FAM6_NEHALEM_G, &nhm_features },
+       { INTEL_FAM6_NEHALEM_EP, &nhm_features },
+       { INTEL_FAM6_NEHALEM_EX, &nhx_features },
+       { INTEL_FAM6_WESTMERE, &nhm_features },
+       { INTEL_FAM6_WESTMERE_EP, &nhm_features },
+       { INTEL_FAM6_WESTMERE_EX, &nhx_features },
+       { INTEL_FAM6_SANDYBRIDGE, &snb_features },
+       { INTEL_FAM6_SANDYBRIDGE_X, &snx_features },
+       { INTEL_FAM6_IVYBRIDGE, &ivb_features },
+       { INTEL_FAM6_IVYBRIDGE_X, &ivx_features },
+       { INTEL_FAM6_HASWELL, &hsw_features },
+       { INTEL_FAM6_HASWELL_X, &hsx_features },
+       { INTEL_FAM6_HASWELL_L, &hswl_features },
+       { INTEL_FAM6_HASWELL_G, &hswg_features },
+       { INTEL_FAM6_BROADWELL, &bdw_features },
+       { INTEL_FAM6_BROADWELL_G, &bdwg_features },
+       { INTEL_FAM6_BROADWELL_X, &bdx_features },
+       { INTEL_FAM6_BROADWELL_D, &bdx_features },
+       { INTEL_FAM6_SKYLAKE_L, &skl_features },
+       { INTEL_FAM6_SKYLAKE, &skl_features },
+       { INTEL_FAM6_SKYLAKE_X, &skx_features },
+       { INTEL_FAM6_KABYLAKE_L, &skl_features },
+       { INTEL_FAM6_KABYLAKE, &skl_features },
+       { INTEL_FAM6_COMETLAKE, &skl_features },
+       { INTEL_FAM6_COMETLAKE_L, &skl_features },
+       { INTEL_FAM6_CANNONLAKE_L, &cnl_features },
+       { INTEL_FAM6_ICELAKE_X, &icx_features },
+       { INTEL_FAM6_ICELAKE_D, &icx_features },
+       { INTEL_FAM6_ICELAKE_L, &cnl_features },
+       { INTEL_FAM6_ICELAKE_NNPI, &cnl_features },
+       { INTEL_FAM6_ROCKETLAKE, &cnl_features },
+       { INTEL_FAM6_TIGERLAKE_L, &cnl_features },
+       { INTEL_FAM6_TIGERLAKE, &cnl_features },
+       { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features },
+       { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features },
+       { INTEL_FAM6_GRANITERAPIDS_X, &spr_features },
+       { INTEL_FAM6_LAKEFIELD, &cnl_features },
+       { INTEL_FAM6_ALDERLAKE, &adl_features },
+       { INTEL_FAM6_ALDERLAKE_L, &adl_features },
+       { INTEL_FAM6_RAPTORLAKE, &adl_features },
+       { INTEL_FAM6_RAPTORLAKE_P, &adl_features },
+       { INTEL_FAM6_RAPTORLAKE_S, &adl_features },
+       { INTEL_FAM6_METEORLAKE, &cnl_features },
+       { INTEL_FAM6_METEORLAKE_L, &cnl_features },
+       { INTEL_FAM6_ARROWLAKE, &cnl_features },
+       { INTEL_FAM6_LUNARLAKE_M, &cnl_features },
+       { INTEL_FAM6_ATOM_SILVERMONT, &slv_features },
+       { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features },
+       { INTEL_FAM6_ATOM_AIRMONT, &amt_features },
+       { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features },
+       { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features },
+       { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features },
+       { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features },
+       { INTEL_FAM6_ATOM_TREMONT, &tmt_features },
+       { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features },
+       { INTEL_FAM6_ATOM_GRACEMONT, &adl_features },
+       { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features },
+       { INTEL_FAM6_ATOM_CRESTMONT, &grr_features },
+       { INTEL_FAM6_XEON_PHI_KNL, &knl_features },
+       { INTEL_FAM6_XEON_PHI_KNM, &knl_features },
+       /*
+        * Missing support for
+        * INTEL_FAM6_ICELAKE
+        * INTEL_FAM6_ATOM_SILVERMONT_MID
+        * INTEL_FAM6_ATOM_AIRMONT_MID
+        * INTEL_FAM6_ATOM_AIRMONT_NP
+        */
+       { 0, NULL },
+};
+
+static const struct platform_features *platform;
+
+void probe_platform_features(unsigned int family, unsigned int model)
+{
+       int i;
+
+       platform = &default_features;
+
+       if (authentic_amd || hygon_genuine) {
+               if (max_extended_level >= 0x80000007) {
+                       unsigned int eax, ebx, ecx, edx;
+
+                       __cpuid(0x80000007, eax, ebx, ecx, edx);
+                       /* RAPL (Fam 17h+) */
+                       if ((edx & (1 << 14)) && family >= 0x17)
+                               platform = &amd_features_with_rapl;
+               }
+               return;
+       }
+
+       if (!genuine_intel || family != 6)
+               return;
+
+       for (i = 0; turbostat_pdata[i].features; i++) {
+               if (turbostat_pdata[i].model == model) {
+                       platform = turbostat_pdata[i].features;
+                       return;
+               }
+       }
+}
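+
+/*
+ * Illustrative note (editor's addition, not part of the original patch):
+ * probe_platform_features() is the single dispatch point for model-specific
+ * behavior. The { 0, NULL } sentinel terminates the turbostat_pdata[] walk,
+ * and unknown Intel models fall back to the empty default_features. After
+ * probing, callers simply test the selected entry, e.g.:
+ *
+ *	if (platform->rapl_msrs & RAPL_PKG)
+ *		... read MSR_PKG_ENERGY_STATUS ...
+ */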
+
+/* Model specific support End */
+
 #define        TJMAX_DEFAULT   100
 
 /* MSRs that are not yet in the kernel-provided header. */
@@ -333,8 +942,8 @@ int backwards_count;
 char *progname;
 
 #define CPU_SUBSET_MAXCPUS     1024    /* need to use before probe... */
-cpu_set_t *cpu_present_set, *cpu_affinity_set, *cpu_subset;
-size_t cpu_present_setsize, cpu_affinity_setsize, cpu_subset_size;
+cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
+size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
 #define MAX_ADDED_COUNTERS 8
 #define MAX_ADDED_THREAD_COUNTERS 24
 #define BITMASK_SIZE 32
@@ -355,12 +964,11 @@ struct thread_data {
        unsigned int x2apic_id;
        unsigned int flags;
        bool is_atom;
-#define CPU_IS_FIRST_THREAD_IN_CORE    0x2
-#define CPU_IS_FIRST_CORE_IN_PACKAGE   0x4
        unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
 } *thread_even, *thread_odd;
 
 struct core_data {
+       int base_cpu;
        unsigned long long c3;
        unsigned long long c6;
        unsigned long long c7;
@@ -373,6 +981,7 @@ struct core_data {
 } *core_even, *core_odd;
 
 struct pkg_data {
+       int base_cpu;
        unsigned long long pc2;
        unsigned long long pc3;
        unsigned long long pc6;
@@ -456,7 +1065,7 @@ off_t idx_to_offset(int idx)
 
        switch (idx) {
        case IDX_PKG_ENERGY:
-               if (do_rapl & RAPL_AMD_F17H)
+               if (platform->rapl_msrs & RAPL_AMD_F17H)
                        offset = MSR_PKG_ENERGY_STAT;
                else
                        offset = MSR_PKG_ENERGY_STATUS;
@@ -516,17 +1125,17 @@ int idx_valid(int idx)
 {
        switch (idx) {
        case IDX_PKG_ENERGY:
-               return do_rapl & (RAPL_PKG | RAPL_AMD_F17H);
+               return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H);
        case IDX_DRAM_ENERGY:
-               return do_rapl & RAPL_DRAM;
+               return platform->rapl_msrs & RAPL_DRAM;
        case IDX_PP0_ENERGY:
-               return do_rapl & RAPL_CORES_ENERGY_STATUS;
+               return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS;
        case IDX_PP1_ENERGY:
-               return do_rapl & RAPL_GFX;
+               return platform->rapl_msrs & RAPL_GFX;
        case IDX_PKG_PERF:
-               return do_rapl & RAPL_PKG_PERF_STATUS;
+               return platform->rapl_msrs & RAPL_PKG_PERF_STATUS;
        case IDX_DRAM_PERF:
-               return do_rapl & RAPL_DRAM_PERF_STATUS;
+               return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS;
        default:
                return 0;
        }
@@ -563,6 +1172,9 @@ struct topo_params {
        int num_die;
        int num_cpus;
        int num_cores;
+       int allowed_packages;
+       int allowed_cpus;
+       int allowed_cores;
        int max_cpu_num;
        int max_node_num;
        int nodes_per_pkg;
@@ -575,7 +1187,7 @@ struct timeval tv_even, tv_odd, tv_delta;
 int *irq_column_2_cpu;         /* /proc/interrupts column numbers */
 int *irqs_per_cpu;             /* indexed by cpu_num */
 
-void setup_all_buffers(void);
+void setup_all_buffers(bool startup);
 
 char *sys_lpi_file;
 char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
@@ -586,6 +1198,11 @@ int cpu_is_not_present(int cpu)
        return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
 }
 
+int cpu_is_not_allowed(int cpu)
+{
+       return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
+}
+
 /*
  * run func(thread, core, package) in topology order
  * skip non-allowed cpus
@@ -603,10 +1220,9 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk
                                        struct thread_data *t;
                                        struct core_data *c;
                                        struct pkg_data *p;
-
                                        t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
 
-                                       if (cpu_is_not_present(t->cpu_id))
+                                       if (cpu_is_not_allowed(t->cpu_id))
                                                continue;
 
                                        c = GET_CORE(core_base, core_no, node_no, pkg_no);
@@ -622,6 +1238,25 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk
        return 0;
 }
 
+int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       UNUSED(p);
+
+       return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
+}
+
+int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       UNUSED(c);
+
+       return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
+}
+
+int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p);
+}
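+
+/*
+ * Editor's note (illustrative): these helpers replace the former
+ * CPU_IS_FIRST_THREAD_IN_CORE / CPU_IS_FIRST_CORE_IN_PACKAGE flag bits.
+ * A negative base_cpu means no primary thread/core has been recorded yet,
+ * in which case the current CPU is treated as first so that per-core and
+ * per-package counters are still collected exactly once.
+ */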
+
 int cpu_migrate(int cpu)
 {
        CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
@@ -904,11 +1539,11 @@ void print_header(char *delim)
        if (DO_BIC(BIC_CORE_THROT_CNT))
                outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
 
-       if (do_rapl && !rapl_joules) {
-               if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
+       if (platform->rapl_msrs && !rapl_joules) {
+               if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
                        outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
-       } else if (do_rapl && rapl_joules) {
-               if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY))
+       } else if (platform->rapl_msrs && rapl_joules) {
+               if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
                        outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
        }
 
@@ -966,10 +1601,10 @@ void print_header(char *delim)
        if (DO_BIC(BIC_SYS_LPI))
                outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
 
-       if (do_rapl && !rapl_joules) {
+       if (platform->rapl_msrs && !rapl_joules) {
                if (DO_BIC(BIC_PkgWatt))
                        outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
-               if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
+               if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
                        outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
                if (DO_BIC(BIC_GFXWatt))
                        outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
@@ -979,10 +1614,10 @@ void print_header(char *delim)
                        outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
                if (DO_BIC(BIC_RAM__))
                        outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
-       } else if (do_rapl && rapl_joules) {
+       } else if (platform->rapl_msrs && rapl_joules) {
                if (DO_BIC(BIC_Pkg_J))
                        outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
-               if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY))
+               if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
                        outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
                if (DO_BIC(BIC_GFX_J))
                        outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
@@ -1106,11 +1741,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
        int printed = 0;
 
        /* if showing only 1st thread in core and this isn't one, bail out */
-       if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+       if (show_core_only && !is_cpu_first_thread_in_core(t, c, p))
                return 0;
 
        /* if showing only 1st thread in pkg and this isn't one, bail out */
-       if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p))
                return 0;
 
        /* if not summary line and --cpu is used */
@@ -1244,7 +1879,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
                outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
 
        /* print per-core data only for 1st thread in core */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+       if (!is_cpu_first_thread_in_core(t, c, p))
                goto done;
 
        if (DO_BIC(BIC_CPU_c3))
@@ -1284,14 +1919,14 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 
        fmt8 = "%s%.2f";
 
-       if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY))
+       if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
                outp +=
                    sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float);
-       if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY))
+       if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units);
 
        /* print per-package data only for 1st core in package */
-       if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_core_in_package(t, c, p))
                goto done;
 
        /* PkgTmp */
@@ -1352,7 +1987,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
                outp +=
                    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float);
 
-       if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY))
+       if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
                outp +=
                    sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float);
        if (DO_BIC(BIC_GFXWatt))
@@ -1364,7 +1999,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
                            p->energy_dram * rapl_dram_energy_units / interval_float);
        if (DO_BIC(BIC_Pkg_J))
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units);
-       if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY))
+       if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units);
        if (DO_BIC(BIC_GFX_J))
                outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units);
@@ -1527,7 +2162,7 @@ void delta_core(struct core_data *new, struct core_data *old)
 
 int soft_c1_residency_display(int bic)
 {
-       if (!DO_BIC(BIC_CPU_c1) || use_c1_residency_msr)
+       if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res)
                return 0;
 
        return DO_BIC_READ(bic);
@@ -1567,7 +2202,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 
        old->c1 = new->c1 - old->c1;
 
-       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
+       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
+           || soft_c1_residency_display(BIC_Avg_MHz)) {
                if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
                        old->aperf = new->aperf - old->aperf;
                        old->mperf = new->mperf - old->mperf;
@@ -1576,7 +2212,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
                }
        }
 
-       if (use_c1_residency_msr) {
+       if (platform->has_msr_core_c1_res) {
                /*
                 * Some models have a dedicated C1 residency MSR,
                 * which should be more accurate than the derivation below.
@@ -1626,7 +2262,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c,
        int retval = 0;
 
        /* calculate core delta only for 1st thread in core */
-       if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE)
+       if (is_cpu_first_thread_in_core(t, c, p))
                delta_core(c, c2);
 
        /* always calculate thread delta */
@@ -1635,7 +2271,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c,
                return retval;
 
        /* calculate package delta only for 1st core in package */
-       if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)
+       if (is_cpu_first_core_in_package(t, c, p))
                retval = delta_package(p, p2);
 
        return retval;
@@ -1663,9 +2299,6 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
        t->irq_count = 0;
        t->smi_count = 0;
 
-       /* tells format_counters to dump all fields from this set */
-       t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE;
-
        c->c3 = 0;
        c->c6 = 0;
        c->c7 = 0;
@@ -1749,7 +2382,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        }
 
        /* sum per-core values only for 1st thread in core */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+       if (!is_cpu_first_thread_in_core(t, c, p))
                return 0;
 
        average.cores.c3 += c->c3;
@@ -1769,7 +2402,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        }
 
        /* sum per-pkg values only for 1st core in pkg */
-       if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_core_in_package(t, c, p))
                return 0;
 
        if (DO_BIC(BIC_Totl_c0))
@@ -1834,40 +2467,40 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
        /* Use the global time delta for the average. */
        average.threads.tv_delta = tv_delta;
 
-       average.threads.tsc /= topo.num_cpus;
-       average.threads.aperf /= topo.num_cpus;
-       average.threads.mperf /= topo.num_cpus;
-       average.threads.instr_count /= topo.num_cpus;
-       average.threads.c1 /= topo.num_cpus;
+       average.threads.tsc /= topo.allowed_cpus;
+       average.threads.aperf /= topo.allowed_cpus;
+       average.threads.mperf /= topo.allowed_cpus;
+       average.threads.instr_count /= topo.allowed_cpus;
+       average.threads.c1 /= topo.allowed_cpus;
 
        if (average.threads.irq_count > 9999999)
                sums_need_wide_columns = 1;
 
-       average.cores.c3 /= topo.num_cores;
-       average.cores.c6 /= topo.num_cores;
-       average.cores.c7 /= topo.num_cores;
-       average.cores.mc6_us /= topo.num_cores;
+       average.cores.c3 /= topo.allowed_cores;
+       average.cores.c6 /= topo.allowed_cores;
+       average.cores.c7 /= topo.allowed_cores;
+       average.cores.mc6_us /= topo.allowed_cores;
 
        if (DO_BIC(BIC_Totl_c0))
-               average.packages.pkg_wtd_core_c0 /= topo.num_packages;
+               average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
        if (DO_BIC(BIC_Any_c0))
-               average.packages.pkg_any_core_c0 /= topo.num_packages;
+               average.packages.pkg_any_core_c0 /= topo.allowed_packages;
        if (DO_BIC(BIC_GFX_c0))
-               average.packages.pkg_any_gfxe_c0 /= topo.num_packages;
+               average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
        if (DO_BIC(BIC_CPUGFX))
-               average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages;
+               average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;
 
-       average.packages.pc2 /= topo.num_packages;
+       average.packages.pc2 /= topo.allowed_packages;
        if (DO_BIC(BIC_Pkgpc3))
-               average.packages.pc3 /= topo.num_packages;
+               average.packages.pc3 /= topo.allowed_packages;
        if (DO_BIC(BIC_Pkgpc6))
-               average.packages.pc6 /= topo.num_packages;
+               average.packages.pc6 /= topo.allowed_packages;
        if (DO_BIC(BIC_Pkgpc7))
-               average.packages.pc7 /= topo.num_packages;
+               average.packages.pc7 /= topo.allowed_packages;
 
-       average.packages.pc8 /= topo.num_packages;
-       average.packages.pc9 /= topo.num_packages;
-       average.packages.pc10 /= topo.num_packages;
+       average.packages.pc8 /= topo.allowed_packages;
+       average.packages.pc9 /= topo.allowed_packages;
+       average.packages.pc10 /= topo.allowed_packages;
 
        for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW)
@@ -1877,7 +2510,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
                                sums_need_wide_columns = 1;
                        continue;
                }
-               average.threads.counter[i] /= topo.num_cpus;
+               average.threads.counter[i] /= topo.allowed_cpus;
        }
        for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW)
@@ -1886,7 +2519,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
                        if (average.cores.counter[i] > 9999999)
                                sums_need_wide_columns = 1;
                }
-               average.cores.counter[i] /= topo.num_cores;
+               average.cores.counter[i] /= topo.allowed_cores;
        }
        for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
                if (mp->format == FORMAT_RAW)
@@ -1895,7 +2528,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
                        if (average.packages.counter[i] > 9999999)
                                sums_need_wide_columns = 1;
                }
-               average.packages.counter[i] /= topo.num_packages;
+               average.packages.counter[i] /= topo.allowed_packages;
        }
 }
 
@@ -2092,7 +2725,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 retry:
        t->tsc = rdtsc();       /* we are running on local CPU of interest */
 
-       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) {
+       if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
+           || soft_c1_residency_display(BIC_Avg_MHz)) {
                unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
 
                /*
@@ -2158,7 +2792,7 @@ retry:
                        return -5;
                t->smi_count = msr & 0xFFFFFFFF;
        }
-       if (DO_BIC(BIC_CPU_c1) && use_c1_residency_msr) {
+       if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) {
                if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1))
                        return -6;
        }
@@ -2169,7 +2803,7 @@ retry:
        }
 
        /* collect core counters only for 1st thread in core */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
+       if (!is_cpu_first_thread_in_core(t, c, p))
                goto done;
 
        if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) {
@@ -2177,10 +2811,10 @@ retry:
                        return -6;
        }
 
-       if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) {
+       if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) {
                if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6))
                        return -7;
-       } else if (do_knl_cstates || soft_c1_residency_display(BIC_CPU_c6)) {
+       } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) {
                if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6))
                        return -7;
        }
@@ -2212,7 +2846,7 @@ retry:
        if (DO_BIC(BIC_CORE_THROT_CNT))
                get_core_throt_cnt(cpu, &c->core_throt_cnt);
 
-       if (do_rapl & RAPL_AMD_F17H) {
+       if (platform->rapl_msrs & RAPL_AMD_F17H) {
                if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr))
                        return -14;
                c->core_energy = msr & 0xFFFFFFFF;
@@ -2224,7 +2858,7 @@ retry:
        }
 
        /* collect package counters only for 1st core in package */
-       if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_core_in_package(t, c, p))
                goto done;
 
        if (DO_BIC(BIC_Totl_c0)) {
@@ -2247,7 +2881,7 @@ retry:
                if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
                        return -9;
        if (DO_BIC(BIC_Pkgpc6)) {
-               if (do_slm_cstates) {
+               if (platform->has_msr_atom_pkg_c6_residency) {
                        if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6))
                                return -10;
                } else {
@@ -2277,37 +2911,37 @@ retry:
        if (DO_BIC(BIC_SYS_LPI))
                p->sys_lpi = cpuidle_cur_sys_lpi_us;
 
-       if (do_rapl & RAPL_PKG) {
+       if (platform->rapl_msrs & RAPL_PKG) {
                if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr))
                        return -13;
                p->energy_pkg = msr;
        }
-       if (do_rapl & RAPL_CORES_ENERGY_STATUS) {
+       if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) {
                if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr))
                        return -14;
                p->energy_cores = msr;
        }
-       if (do_rapl & RAPL_DRAM) {
+       if (platform->rapl_msrs & RAPL_DRAM) {
                if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr))
                        return -15;
                p->energy_dram = msr;
        }
-       if (do_rapl & RAPL_GFX) {
+       if (platform->rapl_msrs & RAPL_GFX) {
                if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr))
                        return -16;
                p->energy_gfx = msr;
        }
-       if (do_rapl & RAPL_PKG_PERF_STATUS) {
+       if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) {
                if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr))
                        return -16;
                p->rapl_pkg_perf_status = msr;
        }
-       if (do_rapl & RAPL_DRAM_PERF_STATUS) {
+       if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) {
                if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr))
                        return -16;
                p->rapl_dram_perf_status = msr;
        }
-       if (do_rapl & RAPL_AMD_F17H) {
+       if (platform->rapl_msrs & RAPL_AMD_F17H) {
                if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr))
                        return -13;
                p->energy_pkg = msr;
@@ -2414,18 +3048,58 @@ int icx_pkg_cstate_limits[16] =
        PCLRSV, PCLRSV
 };
 
-static void calculate_tsc_tweak()
+void probe_cst_limit(void)
 {
-       tsc_tweak = base_hz / tsc_hz;
-}
+       unsigned long long msr;
+       int *pkg_cstate_limits;
+
+       if (!platform->has_nhm_msrs)
+               return;
+
+       switch (platform->cst_limit) {
+       case CST_LIMIT_NHM:
+               pkg_cstate_limits = nhm_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_SNB:
+               pkg_cstate_limits = snb_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_HSW:
+               pkg_cstate_limits = hsw_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_SKX:
+               pkg_cstate_limits = skx_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_ICX:
+               pkg_cstate_limits = icx_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_SLV:
+               pkg_cstate_limits = slv_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_AMT:
+               pkg_cstate_limits = amt_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_KNL:
+               pkg_cstate_limits = phi_pkg_cstate_limits;
+               break;
+       case CST_LIMIT_GMT:
+               pkg_cstate_limits = glm_pkg_cstate_limits;
+               break;
+       default:
+               return;
+       }
 
-void prewake_cstate_probe(unsigned int family, unsigned int model);
+       get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
+       pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
+}
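+
+/*
+ * Worked example (editor's addition): on a Haswell-class platform
+ * (CST_LIMIT_HSW), the low nibble of MSR_PKG_CST_CONFIG_CONTROL indexes
+ * hsw_pkg_cstate_limits[]; a raw MSR value whose low nibble is 0x3 yields
+ * pkg_cstate_limit = hsw_pkg_cstate_limits[3].
+ */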
 
-static void dump_nhm_platform_info(void)
+static void dump_platform_info(void)
 {
        unsigned long long msr;
        unsigned int ratio;
 
+       if (!platform->has_nhm_msrs)
+               return;
+
        get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
 
        fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
@@ -2435,19 +3109,27 @@ static void dump_nhm_platform_info(void)
 
        ratio = (msr >> 8) & 0xFF;
        fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
+}
+
+static void dump_power_ctl(void)
+{
+       unsigned long long msr;
+
+       if (!platform->has_nhm_msrs)
+               return;
 
        get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
        fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
                base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
 
        /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
-       if (dis_cstate_prewake)
+       if (platform->has_cst_prewake_bit)
                fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
 
        return;
 }
 
-static void dump_hsw_turbo_ratio_limits(void)
+static void dump_turbo_ratio_limit2(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2466,7 +3148,7 @@ static void dump_hsw_turbo_ratio_limits(void)
        return;
 }
 
-static void dump_ivt_turbo_ratio_limits(void)
+static void dump_turbo_ratio_limit1(void)
 {
        unsigned long long msr;
        unsigned int ratio;
@@ -2509,29 +3191,7 @@ static void dump_ivt_turbo_ratio_limits(void)
        return;
 }
 
-int has_turbo_ratio_group_limits(int family, int model)
-{
-
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ATOM_GOLDMONT:
-       case INTEL_FAM6_SKYLAKE_X:
-       case INTEL_FAM6_ICELAKE_X:
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:
-       case INTEL_FAM6_ATOM_GOLDMONT_D:
-       case INTEL_FAM6_ATOM_TREMONT_D:
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model)
+static void dump_turbo_ratio_limits(int trl_msr_offset)
 {
        unsigned long long msr, core_counts;
        int shift;
@@ -2540,7 +3200,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model)
        fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
                base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
 
-       if (has_turbo_ratio_group_limits(family, model)) {
+       if (platform->trl_msrs & TRL_CORECOUNT) {
                get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
                fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
        } else {
@@ -2657,10 +3317,13 @@ static void dump_knl_turbo_ratio_limits(void)
                                ratio[i], bclk, ratio[i] * bclk, cores[i]);
 }
 
-static void dump_nhm_cst_cfg(void)
+static void dump_cst_cfg(void)
 {
        unsigned long long msr;
 
+       if (!platform->has_nhm_msrs)
+               return;
+
        get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
 
        fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);
@@ -2673,7 +3336,7 @@ static void dump_nhm_cst_cfg(void)
                (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);
 
 #define AUTOMATIC_CSTATE_CONVERSION            (1UL << 16)
-       if (has_automatic_cstate_conversion) {
+       if (platform->has_cst_auto_convension) {
                fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
        }
 
@@ -2730,39 +3393,50 @@ void print_irtl(void)
 {
        unsigned long long msr;
 
-       get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
-
-       get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       if (!platform->has_irtl_msrs)
+               return;
 
-       get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       if (platform->supported_cstates & PC3) {
+               get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 
-       if (!do_irtl_hsw)
-               return;
+       if (platform->supported_cstates & PC6) {
+               get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 
-       get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       if (platform->supported_cstates & PC7) {
+               get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 
-       get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       if (platform->supported_cstates & PC8) {
+               get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 
-       get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
-       fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
-       fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
-               (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       if (platform->supported_cstates & PC9) {
+               get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 
+       if (platform->supported_cstates & PC10) {
+               get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
+               fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
+               fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
+                       (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
+       }
 }
 
 void free_fd_percpu(void)
@@ -2785,6 +3459,14 @@ void free_all_buffers(void)
        cpu_present_set = NULL;
        cpu_present_setsize = 0;
 
+       CPU_FREE(cpu_effective_set);
+       cpu_effective_set = NULL;
+       cpu_effective_setsize = 0;
+
+       CPU_FREE(cpu_allowed_set);
+       cpu_allowed_set = NULL;
+       cpu_allowed_setsize = 0;
+
        CPU_FREE(cpu_affinity_set);
        cpu_affinity_set = NULL;
        cpu_affinity_setsize = 0;
@@ -2927,49 +3609,102 @@ int get_physical_node_id(struct cpu_topology *thiscpu)
        return -1;
 }
 
-int get_thread_siblings(struct cpu_topology *thiscpu)
+static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
 {
-       char path[80], character;
-       FILE *filep;
-       unsigned long map;
-       int so, shift, sib_core;
-       int cpu = thiscpu->logical_cpu_id;
-       int offset = topo.max_cpu_num + 1;
-       size_t size;
-       int thread_id = 0;
+       unsigned int start, end;
+       char *next = cpu_str;
 
-       thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
-       if (thiscpu->thread_id < 0)
-               thiscpu->thread_id = thread_id++;
-       if (!thiscpu->put_ids)
-               return -1;
+       while (next && *next) {
 
-       size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
-       CPU_ZERO_S(size, thiscpu->put_ids);
+               if (*next == '-')       /* no negative cpu numbers */
+                       return 1;
 
-       sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
-       filep = fopen(path, "r");
+               start = strtoul(next, &next, 10);
 
-       if (!filep) {
-               warnx("%s: open failed", path);
-               return -1;
-       }
-       do {
-               offset -= BITMASK_SIZE;
-               if (fscanf(filep, "%lx%c", &map, &character) != 2)
-                       err(1, "%s: failed to parse file", path);
-               for (shift = 0; shift < BITMASK_SIZE; shift++) {
-                       if ((map >> shift) & 0x1) {
-                               so = shift + offset;
-                               sib_core = get_core_id(so);
-                               if (sib_core == thiscpu->physical_core_id) {
-                                       CPU_SET_S(so, size, thiscpu->put_ids);
-                                       if ((so != cpu) && (cpus[so].thread_id < 0))
-                                               cpus[so].thread_id = thread_id++;
-                               }
-                       }
-               }
-       } while (character == ',');
+               if (start >= CPU_SUBSET_MAXCPUS)
+                       return 1;
+               CPU_SET_S(start, cpu_set_size, cpu_set);
+
+               if (*next == '\0' || *next == '\n')
+                       break;
+
+               if (*next == ',') {
+                       next += 1;
+                       continue;
+               }
+
+               if (*next == '-') {
+                       next += 1;      /* start range */
+               } else if (*next == '.') {
+                       next += 1;
+                       if (*next == '.')
+                               next += 1;      /* start range */
+                       else
+                               return 1;
+               }
+
+               end = strtoul(next, &next, 10);
+               if (end <= start)
+                       return 1;
+
+               while (++start <= end) {
+                       if (start >= CPU_SUBSET_MAXCPUS)
+                               return 1;
+                       CPU_SET_S(start, cpu_set_size, cpu_set);
+               }
+
+               if (*next == ',')
+                       next += 1;
+               else if (*next != '\0' && *next != '\n')
+                       return 1;
+       }
+
+       return 0;
+}
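+
+/*
+ * Usage sketch (editor's addition): parse_cpu_str() accepts kernel-style
+ * cpulists such as "0-2,5" (and the "0..2,5" variant), so
+ *
+ *	parse_cpu_str("0-2,5", set, setsize);
+ *
+ * marks CPUs 0, 1, 2 and 5 in 'set' and returns 0; a malformed string,
+ * including any negative CPU number, returns 1.
+ */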
+
+int get_thread_siblings(struct cpu_topology *thiscpu)
+{
+       char path[80], character;
+       FILE *filep;
+       unsigned long map;
+       int so, shift, sib_core;
+       int cpu = thiscpu->logical_cpu_id;
+       int offset = topo.max_cpu_num + 1;
+       size_t size;
+       int thread_id = 0;
+
+       thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
+       if (thiscpu->thread_id < 0)
+               thiscpu->thread_id = thread_id++;
+       if (!thiscpu->put_ids)
+               return -1;
+
+       size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
+       CPU_ZERO_S(size, thiscpu->put_ids);
+
+       sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
+       filep = fopen(path, "r");
+
+       if (!filep) {
+               warnx("%s: open failed", path);
+               return -1;
+       }
+       do {
+               offset -= BITMASK_SIZE;
+               if (fscanf(filep, "%lx%c", &map, &character) != 2)
+                       err(1, "%s: failed to parse file", path);
+               for (shift = 0; shift < BITMASK_SIZE; shift++) {
+                       if ((map >> shift) & 0x1) {
+                               so = shift + offset;
+                               sib_core = get_core_id(so);
+                               if (sib_core == thiscpu->physical_core_id) {
+                                       CPU_SET_S(so, size, thiscpu->put_ids);
+                                       if ((so != cpu) && (cpus[so].thread_id < 0))
+                                               cpus[so].thread_id = thread_id++;
+                               }
+                       }
+               }
+       } while (character == ',');
        fclose(filep);
 
        return CPU_COUNT_S(size, thiscpu->put_ids);
@@ -2998,7 +3733,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
 
                                        t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
 
-                                       if (cpu_is_not_present(t->cpu_id))
+                                       if (cpu_is_not_allowed(t->cpu_id))
                                                continue;
 
                                        t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
@@ -3050,11 +3785,51 @@ int for_all_proc_cpus(int (func) (int))
        return 0;
 }
 
+#define PATH_EFFECTIVE_CPUS    "/sys/fs/cgroup/cpuset.cpus.effective"
+
+static char cpu_effective_str[1024];
+
+static int update_effective_str(bool startup)
+{
+       FILE *fp;
+       char *pos;
+       char buf[1024];
+       int ret;
+
+       if (cpu_effective_str[0] == '\0' && !startup)
+               return 0;
+
+       fp = fopen(PATH_EFFECTIVE_CPUS, "r");
+       if (!fp)
+               return 0;
+
+       pos = fgets(buf, 1024, fp);
+       if (!pos)
+               err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);
+
+       fclose(fp);
+
+       ret = strncmp(cpu_effective_str, buf, 1024);
+       if (!ret)
+               return 0;
+
+       strncpy(cpu_effective_str, buf, 1024);
+       return 1;
+}
+
+static void update_effective_set(bool startup)
+{
+       update_effective_str(startup);
+
+       if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
+               err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
+}
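+
+/*
+ * Editor's note (illustrative): update_effective_str(false) is also polled
+ * from the measurement loop; when cpuset.cpus.effective changes, turbostat
+ * calls re_initialize() and restarts the interval with the new allowed-CPU
+ * topology.
+ */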
+
 void re_initialize(void)
 {
        free_all_buffers();
-       setup_all_buffers();
-       fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus);
+       setup_all_buffers(false);
+       fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus);
 }
 
 void set_max_cpu_num(void)
@@ -3191,8 +3966,8 @@ int snapshot_gfx_rc6_ms(void)
 /*
  * snapshot_gfx_mhz()
  *
- * record snapshot of
- * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz
+ * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz
+ * when /sys/class/drm/card0/gt_cur_freq_mhz is not available.
  *
  * return 1 if config change requires a restart, else return 0
  */
@@ -3201,9 +3976,11 @@ int snapshot_gfx_mhz(void)
        static FILE *fp;
        int retval;
 
-       if (fp == NULL)
-               fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
-       else {
+       if (fp == NULL) {
+               fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r");
+               if (!fp)
+                       fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r");
+       } else {
                rewind(fp);
                fflush(fp);
        }
@@ -3218,8 +3995,8 @@ int snapshot_gfx_mhz(void)
 /*
  * snapshot_gfx_cur_mhz()
  *
- * record snapshot of
- * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz
+ * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz
+ * when /sys/class/drm/card0/gt_act_freq_mhz is not available.
  *
  * return 1 if config change requires a restart, else return 0
  */
@@ -3228,9 +4005,11 @@ int snapshot_gfx_act_mhz(void)
        static FILE *fp;
        int retval;
 
-       if (fp == NULL)
-               fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r");
-       else {
+       if (fp == NULL) {
+               fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r");
+               if (!fp)
+                       fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r");
+       } else {
                rewind(fp);
                fflush(fp);
        }
@@ -3562,6 +4341,10 @@ restart:
                        re_initialize();
                        goto restart;
                }
+               if (update_effective_str(false)) {
+                       re_initialize();
+                       goto restart;
+               }
                do_sleep();
                if (snapshot_proc_sysfs_files())
                        goto restart;
@@ -3674,500 +4457,133 @@ void check_permissions(void)
                exit(-6);
 }
 
-/*
- * NHM adds support for additional MSRs:
- *
- * MSR_SMI_COUNT                   0x00000034
- *
- * MSR_PLATFORM_INFO               0x000000ce
- * MSR_PKG_CST_CONFIG_CONTROL     0x000000e2
- *
- * MSR_MISC_PWR_MGMT               0x000001aa
- *
- * MSR_PKG_C3_RESIDENCY            0x000003f8
- * MSR_PKG_C6_RESIDENCY            0x000003f9
- * MSR_CORE_C3_RESIDENCY           0x000003fc
- * MSR_CORE_C6_RESIDENCY           0x000003fd
- *
- * Side effect:
- * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL
- * sets has_misc_feature_control
- */
-int probe_nhm_msrs(unsigned int family, unsigned int model)
+void probe_bclk(void)
 {
        unsigned long long msr;
        unsigned int base_ratio;
-       int *pkg_cstate_limits;
 
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       bclk = discover_bclk(family, model);
+       if (!platform->has_nhm_msrs)
+               return;
 
-       switch (model) {
-       case INTEL_FAM6_NEHALEM:        /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */
-       case INTEL_FAM6_NEHALEM_EX:     /* Nehalem-EX Xeon - Beckton */
-               pkg_cstate_limits = nhm_pkg_cstate_limits;
-               break;
-       case INTEL_FAM6_SANDYBRIDGE:    /* SNB */
-       case INTEL_FAM6_SANDYBRIDGE_X:  /* SNB Xeon */
-       case INTEL_FAM6_IVYBRIDGE:      /* IVB */
-       case INTEL_FAM6_IVYBRIDGE_X:    /* IVB Xeon */
-               pkg_cstate_limits = snb_pkg_cstate_limits;
-               has_misc_feature_control = 1;
-               break;
-       case INTEL_FAM6_HASWELL:        /* HSW */
-       case INTEL_FAM6_HASWELL_G:      /* HSW */
-       case INTEL_FAM6_HASWELL_X:      /* HSX */
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_BROADWELL:      /* BDW */
-       case INTEL_FAM6_BROADWELL_G:    /* BDW */
-       case INTEL_FAM6_BROADWELL_X:    /* BDX */
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-               pkg_cstate_limits = hsw_pkg_cstate_limits;
-               has_misc_feature_control = 1;
-               break;
-       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:       /* SPR */
-               pkg_cstate_limits = skx_pkg_cstate_limits;
-               has_misc_feature_control = 1;
-               break;
-       case INTEL_FAM6_ICELAKE_X:      /* ICX */
-               pkg_cstate_limits = icx_pkg_cstate_limits;
-               has_misc_feature_control = 1;
-               break;
-       case INTEL_FAM6_ATOM_SILVERMONT:        /* BYT */
-               no_MSR_MISC_PWR_MGMT = 1;
-               /* FALLTHRU */
-       case INTEL_FAM6_ATOM_SILVERMONT_D:      /* AVN */
-               pkg_cstate_limits = slv_pkg_cstate_limits;
-               break;
-       case INTEL_FAM6_ATOM_AIRMONT:   /* AMT */
-               pkg_cstate_limits = amt_pkg_cstate_limits;
-               no_MSR_MISC_PWR_MGMT = 1;
-               break;
-       case INTEL_FAM6_XEON_PHI_KNL:   /* PHI */
-               pkg_cstate_limits = phi_pkg_cstate_limits;
-               break;
-       case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
-       case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
-       case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-       case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
-               pkg_cstate_limits = glm_pkg_cstate_limits;
-               break;
-       default:
-               return 0;
-       }
-       get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
-       pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
+       if (platform->bclk_freq == BCLK_100MHZ)
+               bclk = 100.00;
+       else if (platform->bclk_freq == BCLK_133MHZ)
+               bclk = 133.33;
+       else if (platform->bclk_freq == BCLK_SLV)
+               bclk = slm_bclk();
+       else
+               return;
 
        get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
        base_ratio = (msr >> 8) & 0xFF;
 
        base_hz = base_ratio * bclk * 1000000;
        has_base_hz = 1;
-       return 1;
-}
 
-/*
- * SLV client has support for unique MSRs:
- *
- * MSR_CC6_DEMOTION_POLICY_CONFIG
- * MSR_MC6_DEMOTION_POLICY_CONFIG
- */
+       if (platform->enable_tsc_tweak)
+               tsc_tweak = base_hz / tsc_hz;
+}
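
A worked example of the base_hz computation in probe_bclk(), using an assumed MSR_PLATFORM_INFO value: bits 15:8 hold the base (non-turbo) ratio, which is scaled by the bus clock in MHz.

#include <stdio.h>

int main(void)
{
        unsigned long long msr = 0x80838f3012400ULL;    /* illustrative raw value */
        unsigned int base_ratio = (msr >> 8) & 0xFF;    /* 0x24 = 36 */
        double bclk = 100.00;                           /* BCLK_100MHZ */
        double base_hz = base_ratio * bclk * 1000000;

        printf("base frequency: %.0f Hz\n", base_hz);   /* 3600000000 */
        return 0;
}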
 
-int has_slv_msrs(unsigned int family, unsigned int model)
+static void remove_underbar(char *s)
 {
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
+       char *to = s;
 
-       switch (model) {
-       case INTEL_FAM6_ATOM_SILVERMONT:
-       case INTEL_FAM6_ATOM_SILVERMONT_MID:
-       case INTEL_FAM6_ATOM_AIRMONT_MID:
-               return 1;
+       while (*s) {
+               if (*s != '_')
+                       *to++ = *s;
+               s++;
        }
-       return 0;
+
+       *to = 0;
 }
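
Usage sketch for remove_underbar(): it compacts the string in place, copying every non-underscore character over the original buffer. The helper body is reproduced verbatim so the snippet compiles on its own.

#include <stdio.h>

static void remove_underbar(char *s)
{
        char *to = s;

        while (*s) {
                if (*s != '_')
                        *to++ = *s;
                s++;
        }

        *to = 0;
}

int main(void)
{
        char name[] = "C1_ACPI";

        remove_underbar(name);
        printf("%s\n", name);   /* prints "C1ACPI" */
        return 0;
}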
 
-int is_dnv(unsigned int family, unsigned int model)
+static void dump_turbo_ratio_info(void)
 {
+       if (!has_turbo)
+               return;
 
-       if (!genuine_intel)
-               return 0;
+       if (!platform->has_nhm_msrs)
+               return;
 
-       if (family != 6)
-               return 0;
+       if (platform->trl_msrs & TRL_LIMIT2)
+               dump_turbo_ratio_limit2();
 
-       switch (model) {
-       case INTEL_FAM6_ATOM_GOLDMONT_D:
-               return 1;
-       }
-       return 0;
-}
+       if (platform->trl_msrs & TRL_LIMIT1)
+               dump_turbo_ratio_limit1();
 
-int is_bdx(unsigned int family, unsigned int model)
-{
+       if (platform->trl_msrs & TRL_BASE) {
+               dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT);
 
-       if (!genuine_intel)
-               return 0;
+               if (is_hybrid)
+                       dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT);
+       }
 
-       if (family != 6)
-               return 0;
+       if (platform->trl_msrs & TRL_ATOM)
+               dump_atom_turbo_ratio_limits();
 
-       switch (model) {
-       case INTEL_FAM6_BROADWELL_X:
-               return 1;
-       }
-       return 0;
+       if (platform->trl_msrs & TRL_KNL)
+               dump_knl_turbo_ratio_limits();
+
+       if (platform->has_config_tdp)
+               dump_config_tdp();
 }
 
-int is_skx(unsigned int family, unsigned int model)
+static int read_sysfs_int(char *path)
 {
+       FILE *input;
+       int retval = -1;
 
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_SKYLAKE_X:
-               return 1;
+       input = fopen(path, "r");
+       if (input == NULL) {
+               if (debug)
+                       fprintf(outf, "NSFOD %s\n", path);
+               return (-1);
        }
-       return 0;
+       if (fscanf(input, "%d", &retval) != 1)
+               err(1, "%s: failed to read int from file", path);
+       fclose(input);
+
+       return (retval);
 }
 
-int is_icx(unsigned int family, unsigned int model)
+static void dump_sysfs_file(char *path)
 {
+       FILE *input;
+       char cpuidle_buf[64];
 
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ICELAKE_X:
-               return 1;
+       input = fopen(path, "r");
+       if (input == NULL) {
+               if (debug)
+                       fprintf(outf, "NSFOD %s\n", path);
+               return;
        }
-       return 0;
+       if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
+               err(1, "%s: failed to read file", path);
+       fclose(input);
+
+       fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
 }
 
-int is_spr(unsigned int family, unsigned int model)
+static void probe_intel_uncore_frequency(void)
 {
+       int i, j;
+       char path[128];
 
        if (!genuine_intel)
-               return 0;
+               return;
 
-       if (family != 6)
-               return 0;
+       if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK))
+               return;
 
-       switch (model) {
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:
-               return 1;
-       }
-       return 0;
-}
+       /* Cluster-level sysfs is not supported yet. */
+       if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK))
+               return;
 
-int is_ehl(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
+       if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK))
+               BIC_PRESENT(BIC_UNCORE_MHZ);
 
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ATOM_TREMONT:
-               return 1;
-       }
-       return 0;
-}
-
-int is_jvl(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ATOM_TREMONT_D:
-               return 1;
-       }
-       return 0;
-}
-
-int has_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (has_slv_msrs(family, model))
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-               /* Nehalem compatible, but do not include turbo-ratio limit support */
-       case INTEL_FAM6_NEHALEM_EX:     /* Nehalem-EX Xeon - Beckton */
-       case INTEL_FAM6_XEON_PHI_KNL:   /* PHI - Knights Landing (different MSR definition) */
-               return 0;
-       default:
-               return 1;
-       }
-}
-
-int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (has_slv_msrs(family, model))
-               return 1;
-
-       return 0;
-}
-
-int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_IVYBRIDGE_X:    /* IVB Xeon */
-       case INTEL_FAM6_HASWELL_X:      /* HSW Xeon */
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_HASWELL_X:      /* HSW Xeon */
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_XEON_PHI_KNL:   /* Knights Landing */
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ATOM_GOLDMONT:
-       case INTEL_FAM6_SKYLAKE_X:
-       case INTEL_FAM6_ICELAKE_X:
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-int has_config_tdp(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_IVYBRIDGE:      /* IVB */
-       case INTEL_FAM6_HASWELL:        /* HSW */
-       case INTEL_FAM6_HASWELL_X:      /* HSX */
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_HASWELL_G:      /* HSW */
-       case INTEL_FAM6_BROADWELL:      /* BDW */
-       case INTEL_FAM6_BROADWELL_G:    /* BDW */
-       case INTEL_FAM6_BROADWELL_X:    /* BDX */
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
-       case INTEL_FAM6_ICELAKE_X:      /* ICX */
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:       /* SPR */
-       case INTEL_FAM6_XEON_PHI_KNL:   /* Knights Landing */
-               return 1;
-       default:
-               return 0;
-       }
-}
-
-/*
- * tcc_offset_bits:
- * 0: Tcc Offset not supported (Default)
- * 6: Bit 29:24 of MSR_PLATFORM_INFO
- * 4: Bit 27:24 of MSR_PLATFORM_INFO
- */
-void check_tcc_offset(int model)
-{
-       unsigned long long msr;
-
-       if (!genuine_intel)
-               return;
-
-       switch (model) {
-       case INTEL_FAM6_SKYLAKE_L:
-       case INTEL_FAM6_SKYLAKE:
-       case INTEL_FAM6_KABYLAKE_L:
-       case INTEL_FAM6_KABYLAKE:
-       case INTEL_FAM6_ICELAKE_L:
-       case INTEL_FAM6_ICELAKE:
-       case INTEL_FAM6_TIGERLAKE_L:
-       case INTEL_FAM6_TIGERLAKE:
-       case INTEL_FAM6_COMETLAKE:
-               if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) {
-                       msr = (msr >> 30) & 1;
-                       if (msr)
-                               tcc_offset_bits = 6;
-               }
-               return;
-       default:
-               return;
-       }
-}
-
-static void remove_underbar(char *s)
-{
-       char *to = s;
-
-       while (*s) {
-               if (*s != '_')
-                       *to++ = *s;
-               s++;
-       }
-
-       *to = 0;
-}
-
-static void dump_turbo_ratio_info(unsigned int family, unsigned int model)
-{
-       if (!has_turbo)
-               return;
-
-       if (has_hsw_turbo_ratio_limit(family, model))
-               dump_hsw_turbo_ratio_limits();
-
-       if (has_ivt_turbo_ratio_limit(family, model))
-               dump_ivt_turbo_ratio_limits();
-
-       if (has_turbo_ratio_limit(family, model)) {
-               dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT, family, model);
-
-               if (is_hybrid)
-                       dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT, family, model);
-       }
-
-       if (has_atom_turbo_ratio_limit(family, model))
-               dump_atom_turbo_ratio_limits();
-
-       if (has_knl_turbo_ratio_limit(family, model))
-               dump_knl_turbo_ratio_limits();
-
-       if (has_config_tdp(family, model))
-               dump_config_tdp();
-}
-
-static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model)
-{
-       if (!do_nhm_platform_info)
-               return;
-
-       dump_nhm_platform_info();
-       dump_turbo_ratio_info(family, model);
-       dump_nhm_cst_cfg();
-}
-
-static int read_sysfs_int(char *path)
-{
-       FILE *input;
-       int retval = -1;
-
-       input = fopen(path, "r");
-       if (input == NULL) {
-               if (debug)
-                       fprintf(outf, "NSFOD %s\n", path);
-               return (-1);
-       }
-       if (fscanf(input, "%d", &retval) != 1)
-               err(1, "%s: failed to read int from file", path);
-       fclose(input);
-
-       return (retval);
-}
-
-static void dump_sysfs_file(char *path)
-{
-       FILE *input;
-       char cpuidle_buf[64];
-
-       input = fopen(path, "r");
-       if (input == NULL) {
-               if (debug)
-                       fprintf(outf, "NSFOD %s\n", path);
-               return;
-       }
-       if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
-               err(1, "%s: failed to read file", path);
-       fclose(input);
-
-       fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
-}
-
-static void intel_uncore_frequency_probe(void)
-{
-       int i, j;
-       char path[128];
-
-       if (!genuine_intel)
-               return;
-
-       if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK))
-               return;
-
-       if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK))
-               BIC_PRESENT(BIC_UNCORE_MHZ);
-
-       if (quiet)
-               return;
+       if (quiet)
+               return;
 
        for (i = 0; i < topo.num_packages; ++i) {
                for (j = 0; j < topo.num_die; ++j) {
@@ -4194,6 +4610,20 @@ static void intel_uncore_frequency_probe(void)
        }
 }
 
+static void probe_graphics(void)
+{
+       if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
+               BIC_PRESENT(BIC_GFX_rc6);
+
+       if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) ||
+           !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
+               BIC_PRESENT(BIC_GFXMHz);
+
+       if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) ||
+           !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
+               BIC_PRESENT(BIC_GFXACTMHz);
+}
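
probe_graphics() decides which graphics counters exist purely by probing known sysfs paths with access(2). A standalone sketch of that style, using the same primary paths:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *paths[] = {
                "/sys/class/drm/card0/power/rc6_residency_ms",
                "/sys/class/drm/card0/gt_cur_freq_mhz",
                "/sys/class/drm/card0/gt_act_freq_mhz",
        };
        unsigned int i;

        for (i = 0; i < sizeof(paths) / sizeof(paths[0]); i++)
                printf("%s: %s\n", paths[i],
                       access(paths[i], R_OK) ? "absent" : "present");
        return 0;
}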
+
 static void dump_sysfs_cstate_config(void)
 {
        char path[64];
@@ -4310,7 +4740,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        cpu = t->cpu_id;
 
        /* EPB is per-package */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_thread_in_package(t, c, p))
                return 0;
 
        if (cpu_migrate(cpu)) {
@@ -4359,7 +4789,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        cpu = t->cpu_id;
 
        /* MSR_HWP_CAPABILITIES is per-package */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_thread_in_package(t, c, p))
                return 0;
 
        if (cpu_migrate(cpu)) {
@@ -4442,7 +4872,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
        cpu = t->cpu_id;
 
        /* per-package */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_thread_in_package(t, c, p))
                return 0;
 
        if (cpu_migrate(cpu)) {
@@ -4450,7 +4880,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                return -1;
        }
 
-       if (do_core_perf_limit_reasons) {
+       if (platform->plr_msrs & PLR_CORE) {
                get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
                fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
                fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
@@ -4483,7 +4913,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
 
        }
-       if (do_gfx_perf_limit_reasons) {
+       if (platform->plr_msrs & PLR_GFX) {
                get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
                fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
                fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
@@ -4503,7 +4933,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
                        (msr & 1 << 25) ? "GFXPwr, " : "",
                        (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
        }
-       if (do_ring_perf_limit_reasons) {
+       if (platform->plr_msrs & PLR_RING) {
                get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
                fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
                fprintf(outf, " (Active: %s%s%s%s%s%s)",
@@ -4525,208 +4955,74 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data
 #define        RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
 #define        RAPL_TIME_GRANULARITY   0x3F    /* 6 bit time granularity */
 
-double get_tdp_intel(unsigned int model)
+double get_quirk_tdp(void)
 {
-       unsigned long long msr;
-
-       if (do_rapl & RAPL_PKG_POWER_INFO)
-               if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
-                       return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
+       if (platform->rapl_quirk_tdp)
+               return platform->rapl_quirk_tdp;
 
-       switch (model) {
-       case INTEL_FAM6_ATOM_SILVERMONT:
-       case INTEL_FAM6_ATOM_SILVERMONT_D:
-               return 30.0;
-       default:
-               return 135.0;
-       }
+       return 135.0;
 }
 
-double get_tdp_amd(unsigned int family)
+double get_tdp_intel(void)
 {
-       UNUSED(family);
+       unsigned long long msr;
 
-       /* This is the max stock TDP of HEDT/Server Fam17h+ chips */
-       return 280.0;
+       if (platform->rapl_msrs & RAPL_PKG_POWER_INFO)
+               if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
+                       return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
+       return get_quirk_tdp();
 }
 
-/*
- * rapl_dram_energy_units_probe()
- * Energy units are either hard-coded, or come from RAPL Energy Unit MSR.
- */
-static double rapl_dram_energy_units_probe(int model, double rapl_energy_units)
+double get_tdp_amd(void)
 {
-       /* only called for genuine_intel, family 6 */
-
-       switch (model) {
-       case INTEL_FAM6_HASWELL_X:      /* HSX */
-       case INTEL_FAM6_BROADWELL_X:    /* BDX */
-       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
-       case INTEL_FAM6_XEON_PHI_KNL:   /* KNL */
-       case INTEL_FAM6_ICELAKE_X:      /* ICX */
-               return (rapl_dram_energy_units = 15.3 / 1000000);
-       default:
-               return (rapl_energy_units);
-       }
+       return get_quirk_tdp();
 }
 
-void rapl_probe_intel(unsigned int family, unsigned int model)
+void rapl_probe_intel(void)
 {
        unsigned long long msr;
        unsigned int time_unit;
        double tdp;
 
-       if (family != 6)
-               return;
-
-       switch (model) {
-       case INTEL_FAM6_SANDYBRIDGE:
-       case INTEL_FAM6_IVYBRIDGE:
-       case INTEL_FAM6_HASWELL:        /* HSW */
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_HASWELL_G:      /* HSW */
-       case INTEL_FAM6_BROADWELL:      /* BDW */
-       case INTEL_FAM6_BROADWELL_G:    /* BDW */
-               do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO;
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_Cor_J);
-                       BIC_PRESENT(BIC_GFX_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_CorWatt);
-                       BIC_PRESENT(BIC_GFXWatt);
-               }
-               break;
-       case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
-       case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
-               do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO;
-               if (rapl_joules)
-                       BIC_PRESENT(BIC_Pkg_J);
-               else
-                       BIC_PRESENT(BIC_PkgWatt);
-               break;
-       case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-               do_rapl =
-                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
-                   | RAPL_GFX | RAPL_PKG_POWER_INFO;
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_Cor_J);
-                       BIC_PRESENT(BIC_RAM_J);
-                       BIC_PRESENT(BIC_GFX_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_CorWatt);
-                       BIC_PRESENT(BIC_RAMWatt);
-                       BIC_PRESENT(BIC_GFXWatt);
-               }
-               break;
-       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
-               do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
-               BIC_PRESENT(BIC_PKG__);
-               if (rapl_joules)
-                       BIC_PRESENT(BIC_Pkg_J);
-               else
-                       BIC_PRESENT(BIC_PkgWatt);
-               break;
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-               do_rapl =
-                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS
-                   | RAPL_GFX | RAPL_PKG_POWER_INFO;
-               BIC_PRESENT(BIC_PKG__);
-               BIC_PRESENT(BIC_RAM__);
-               if (rapl_joules) {
+       if (rapl_joules) {
+               if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS)
                        BIC_PRESENT(BIC_Pkg_J);
+               if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS)
                        BIC_PRESENT(BIC_Cor_J);
+               if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
                        BIC_PRESENT(BIC_RAM_J);
+               if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS)
                        BIC_PRESENT(BIC_GFX_J);
-               } else {
+       } else {
+               if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS)
                        BIC_PRESENT(BIC_PkgWatt);
+               if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS)
                        BIC_PRESENT(BIC_CorWatt);
+               if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS)
                        BIC_PRESENT(BIC_RAMWatt);
+               if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS)
                        BIC_PRESENT(BIC_GFXWatt);
-               }
-               break;
-       case INTEL_FAM6_HASWELL_X:      /* HSX */
-       case INTEL_FAM6_BROADWELL_X:    /* BDX */
-       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
-       case INTEL_FAM6_ICELAKE_X:      /* ICX */
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:       /* SPR */
-       case INTEL_FAM6_XEON_PHI_KNL:   /* KNL */
-               do_rapl =
-                   RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
-                   RAPL_PKG_POWER_INFO;
+       }
+
+       if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS)
                BIC_PRESENT(BIC_PKG__);
+       if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS)
                BIC_PRESENT(BIC_RAM__);
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_RAM_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_RAMWatt);
-               }
-               break;
-       case INTEL_FAM6_SANDYBRIDGE_X:
-       case INTEL_FAM6_IVYBRIDGE_X:
-               do_rapl =
-                   RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS |
-                   RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
-               BIC_PRESENT(BIC_PKG__);
-               BIC_PRESENT(BIC_RAM__);
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_Cor_J);
-                       BIC_PRESENT(BIC_RAM_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_CorWatt);
-                       BIC_PRESENT(BIC_RAMWatt);
-               }
-               break;
-       case INTEL_FAM6_ATOM_SILVERMONT:        /* BYT */
-       case INTEL_FAM6_ATOM_SILVERMONT_D:      /* AVN */
-               do_rapl = RAPL_PKG | RAPL_CORES;
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_Cor_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_CorWatt);
-               }
-               break;
-       case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-               do_rapl =
-                   RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS |
-                   RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS;
-               BIC_PRESENT(BIC_PKG__);
-               BIC_PRESENT(BIC_RAM__);
-               if (rapl_joules) {
-                       BIC_PRESENT(BIC_Pkg_J);
-                       BIC_PRESENT(BIC_Cor_J);
-                       BIC_PRESENT(BIC_RAM_J);
-               } else {
-                       BIC_PRESENT(BIC_PkgWatt);
-                       BIC_PRESENT(BIC_CorWatt);
-                       BIC_PRESENT(BIC_RAMWatt);
-               }
-               break;
-       default:
-               return;
-       }
 
        /* units on package 0, verify later other packages match */
        if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
                return;
 
        rapl_power_units = 1.0 / (1 << (msr & 0xF));
-       if (model == INTEL_FAM6_ATOM_SILVERMONT)
+       if (platform->has_rapl_divisor)
                rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
        else
                rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));
 
-       rapl_dram_energy_units = rapl_dram_energy_units_probe(model, rapl_energy_units);
+       if (platform->has_fixed_rapl_unit)
+               rapl_dram_energy_units = (15.3 / 1000000);
+       else
+               rapl_dram_energy_units = rapl_energy_units;
 
        time_unit = msr >> 16 & 0xF;
        if (time_unit == 0)
@@ -4734,32 +5030,18 @@ void rapl_probe_intel(unsigned int family, unsigned int model)
 
        rapl_time_units = 1.0 / (1 << (time_unit));
 
-       tdp = get_tdp_intel(model);
+       tdp = get_tdp_intel();
 
        rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
        if (!quiet)
                fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
 }
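
A worked decode of MSR_RAPL_POWER_UNIT matching the arithmetic above, on an assumed raw value of 0xa0e03 (a common reading on recent Intel parts): power unit 1/2^bits[3:0] watts, energy unit 1/2^bits[12:8] joules, time unit 1/2^bits[19:16] seconds, plus the joule-counter-range estimate that rapl_probe_intel() prints.

#include <stdio.h>

int main(void)
{
        unsigned long long msr = 0xa0e03ULL;                       /* assumed reading */
        double rapl_power_units = 1.0 / (1 << (msr & 0xF));        /* 1/8 W */
        double rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); /* 1/16384 J */
        double rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF));   /* 1/1024 s */
        double tdp = 135.0;                                        /* get_quirk_tdp() default */

        printf("units: %g W, %g J, %g s\n",
               rapl_power_units, rapl_energy_units, rapl_time_units);
        printf("RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n",
               0xFFFFFFFF * rapl_energy_units / tdp, tdp);
        return 0;
}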
 
-void rapl_probe_amd(unsigned int family, unsigned int model)
+void rapl_probe_amd(void)
 {
        unsigned long long msr;
-       unsigned int eax, ebx, ecx, edx;
-       unsigned int has_rapl = 0;
        double tdp;
 
-       UNUSED(model);
-
-       if (max_extended_level >= 0x80000007) {
-               __cpuid(0x80000007, eax, ebx, ecx, edx);
-               /* RAPL (Fam 17h+) */
-               has_rapl = edx & (1 << 14);
-       }
-
-       if (!has_rapl || family < 0x17)
-               return;
-
-       do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY;
        if (rapl_joules) {
                BIC_PRESENT(BIC_Pkg_J);
                BIC_PRESENT(BIC_Cor_J);
@@ -4775,128 +5057,13 @@ void rapl_probe_amd(unsigned int family, unsigned int model)
        rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
        rapl_power_units = ldexp(1.0, -(msr & 0xf));
 
-       tdp = get_tdp_amd(family);
+       tdp = get_tdp_amd();
 
        rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
        if (!quiet)
                fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
 }
 
-/*
- * rapl_probe()
- *
- * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units
- */
-void rapl_probe(unsigned int family, unsigned int model)
-{
-       if (genuine_intel)
-               rapl_probe_intel(family, model);
-       if (authentic_amd || hygon_genuine)
-               rapl_probe_amd(family, model);
-}
-
-void perf_limit_reasons_probe(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return;
-
-       if (family != 6)
-               return;
-
-       switch (model) {
-       case INTEL_FAM6_HASWELL:        /* HSW */
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_HASWELL_G:      /* HSW */
-               do_gfx_perf_limit_reasons = 1;
-               /* FALLTHRU */
-       case INTEL_FAM6_HASWELL_X:      /* HSX */
-               do_core_perf_limit_reasons = 1;
-               do_ring_perf_limit_reasons = 1;
-       default:
-               return;
-       }
-}
-
-void automatic_cstate_conversion_probe(unsigned int family, unsigned int model)
-{
-       if (family != 6)
-               return;
-
-       switch (model) {
-       case INTEL_FAM6_BROADWELL_X:
-       case INTEL_FAM6_SKYLAKE_X:
-               has_automatic_cstate_conversion = 1;
-       }
-}
-
-void prewake_cstate_probe(unsigned int family, unsigned int model)
-{
-       if (is_icx(family, model) || is_spr(family, model))
-               dis_cstate_prewake = 1;
-}
-
-int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
-{
-       unsigned long long msr;
-       unsigned int dts, dts2;
-       int cpu;
-
-       UNUSED(c);
-       UNUSED(p);
-
-       if (!(do_dts || do_ptm))
-               return 0;
-
-       cpu = t->cpu_id;
-
-       /* DTS is per-core, no need to print for each thread */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE))
-               return 0;
-
-       if (cpu_migrate(cpu)) {
-               fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
-               return -1;
-       }
-
-       if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) {
-               if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
-                       return 0;
-
-               dts = (msr >> 16) & 0x7F;
-               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
-
-               if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
-                       return 0;
-
-               dts = (msr >> 16) & 0x7F;
-               dts2 = (msr >> 8) & 0x7F;
-               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
-                       cpu, msr, tj_max - dts, tj_max - dts2);
-       }
-
-       if (do_dts && debug) {
-               unsigned int resolution;
-
-               if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
-                       return 0;
-
-               dts = (msr >> 16) & 0x7F;
-               resolution = (msr >> 27) & 0xF;
-               fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
-                       cpu, msr, tj_max - dts, resolution);
-
-               if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
-                       return 0;
-
-               dts = (msr >> 16) & 0x7F;
-               dts2 = (msr >> 8) & 0x7F;
-               fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
-                       cpu, msr, tj_max - dts, tj_max - dts2);
-       }
-
-       return 0;
-}
-
 void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
 {
        fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
@@ -4918,11 +5085,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        UNUSED(c);
        UNUSED(p);
 
-       if (!do_rapl)
+       if (!platform->rapl_msrs)
                return 0;
 
        /* RAPL counters are per package, so print only for 1st thread/package */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_thread_in_package(t, c, p))
                return 0;
 
        cpu = t->cpu_id;
@@ -4931,7 +5098,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                return -1;
        }
 
-       if (do_rapl & RAPL_AMD_F17H) {
+       if (platform->rapl_msrs & RAPL_AMD_F17H) {
                msr_name = "MSR_RAPL_PWR_UNIT";
                if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
                        return -1;
@@ -4944,7 +5111,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
        fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
                rapl_power_units, rapl_energy_units, rapl_time_units);
 
-       if (do_rapl & RAPL_PKG_POWER_INFO) {
+       if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) {
 
                if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
                        return -5;
@@ -4957,7 +5124,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
 
        }
-       if (do_rapl & RAPL_PKG) {
+       if (platform->rapl_msrs & RAPL_PKG) {
 
                if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
                        return -9;
@@ -4981,7 +5148,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
        }
 
-       if (do_rapl & RAPL_DRAM_POWER_INFO) {
+       if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) {
                if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
                        return -6;
 
@@ -4992,7 +5159,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
                        ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
                        ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
        }
-       if (do_rapl & RAPL_DRAM) {
+       if (platform->rapl_msrs & RAPL_DRAM) {
                if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
                        return -9;
                fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
@@ -5000,20 +5167,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
                print_power_limit_msr(cpu, msr, "DRAM Limit");
        }
-       if (do_rapl & RAPL_CORE_POLICY) {
+       if (platform->rapl_msrs & RAPL_CORE_POLICY) {
                if (get_msr(cpu, MSR_PP0_POLICY, &msr))
                        return -7;
 
                fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
        }
-       if (do_rapl & RAPL_CORES_POWER_LIMIT) {
+       if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) {
                if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
                        return -9;
                fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
                        cpu, msr, (msr >> 31) & 1 ? "" : "UN");
                print_power_limit_msr(cpu, msr, "Cores Limit");
        }
-       if (do_rapl & RAPL_GFX) {
+       if (platform->rapl_msrs & RAPL_GFX) {
                if (get_msr(cpu, MSR_PP1_POLICY, &msr))
                        return -8;
 
@@ -5029,217 +5196,24 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 }
 
 /*
- * SNB adds support for additional MSRs:
- *
- * MSR_PKG_C7_RESIDENCY            0x000003fa
- * MSR_CORE_C7_RESIDENCY           0x000003fe
- * MSR_PKG_C2_RESIDENCY            0x0000060d
- */
-
-int has_snb_msrs(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_SANDYBRIDGE:
-       case INTEL_FAM6_SANDYBRIDGE_X:
-       case INTEL_FAM6_IVYBRIDGE:      /* IVB */
-       case INTEL_FAM6_IVYBRIDGE_X:    /* IVB Xeon */
-       case INTEL_FAM6_HASWELL:        /* HSW */
-       case INTEL_FAM6_HASWELL_X:      /* HSW */
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_HASWELL_G:      /* HSW */
-       case INTEL_FAM6_BROADWELL:      /* BDW */
-       case INTEL_FAM6_BROADWELL_G:    /* BDW */
-       case INTEL_FAM6_BROADWELL_X:    /* BDX */
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-       case INTEL_FAM6_SKYLAKE_X:      /* SKX */
-       case INTEL_FAM6_ICELAKE_X:      /* ICX */
-       case INTEL_FAM6_SAPPHIRERAPIDS_X:       /* SPR */
-       case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
-       case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
-       case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-       case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-       case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * HSW ULT added support for C8/C9/C10 MSRs:
- *
- * MSR_PKG_C8_RESIDENCY                0x00000630
- * MSR_PKG_C9_RESIDENCY                0x00000631
- * MSR_PKG_C10_RESIDENCY       0x00000632
- *
- * MSR_PKGC8_IRTL              0x00000633
- * MSR_PKGC9_IRTL              0x00000634
- * MSR_PKGC10_IRTL             0x00000635
- *
- */
-int has_c8910_msrs(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_HASWELL_L:      /* HSW */
-       case INTEL_FAM6_BROADWELL:      /* BDW */
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-       case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
-       case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
-       case INTEL_FAM6_ATOM_TREMONT:   /* EHL */
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * SKL adds support for additional MSRS:
+ * probe_rapl()
  *
- * MSR_PKG_WEIGHTED_CORE_C0_RES    0x00000658
- * MSR_PKG_ANY_CORE_C0_RES         0x00000659
- * MSR_PKG_ANY_GFXE_C0_RES         0x0000065A
- * MSR_PKG_BOTH_CORE_GFXE_C0_RES   0x0000065B
+ * sets rapl_power_units, rapl_energy_units, rapl_time_units
  */
-int has_skl_msrs(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-               return 1;
-       }
-       return 0;
-}
-
-int is_slm(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_ATOM_SILVERMONT:        /* BYT */
-       case INTEL_FAM6_ATOM_SILVERMONT_D:      /* AVN */
-               return 1;
-       }
-       return 0;
-}
-
-int is_knl(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_XEON_PHI_KNL:   /* KNL */
-               return 1;
-       }
-       return 0;
-}
-
-int is_cnl(unsigned int family, unsigned int model)
-{
-       if (!genuine_intel)
-               return 0;
-
-       if (family != 6)
-               return 0;
-
-       switch (model) {
-       case INTEL_FAM6_CANNONLAKE_L:   /* CNL */
-               return 1;
-       }
-
-       return 0;
-}
-
-unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model)
-{
-       if (is_knl(family, model))
-               return 1024;
-       return 1;
-}
-
-#define SLM_BCLK_FREQS 5
-double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
-
-double slm_bclk(void)
-{
-       unsigned long long msr = 3;
-       unsigned int i;
-       double freq;
-
-       if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
-               fprintf(outf, "SLM BCLK: unknown\n");
-
-       i = msr & 0xf;
-       if (i >= SLM_BCLK_FREQS) {
-               fprintf(outf, "SLM BCLK[%d] invalid\n", i);
-               i = 3;
-       }
-       freq = slm_freq_table[i];
-
-       if (!quiet)
-               fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
-
-       return freq;
-}
-
-double discover_bclk(unsigned int family, unsigned int model)
+void probe_rapl(void)
 {
-       if (has_snb_msrs(family, model) || is_knl(family, model))
-               return 100.00;
-       else if (is_slm(family, model))
-               return slm_bclk();
-       else
-               return 133.33;
-}
-
-int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
-{
-       unsigned int eax, ebx, ecx, edx;
-
-       UNUSED(c);
-       UNUSED(p);
-
-       if (!genuine_intel)
-               return 0;
+       if (!platform->rapl_msrs)
+               return;
 
-       if (cpu_migrate(t->cpu_id)) {
-               fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
-               return -1;
-       }
+       if (genuine_intel)
+               rapl_probe_intel();
+       if (authentic_amd || hygon_genuine)
+               rapl_probe_amd();
 
-       if (max_level < 0x1a)
-               return 0;
+       if (quiet)
+               return;
 
-       __cpuid(0x1a, eax, ebx, ecx, edx);
-       eax = (eax >> 24) & 0xFF;
-       if (eax == 0x20)
-               t->is_atom = true;
-       return 0;
+       for_all_cpus(print_rapl, ODD_COUNTERS);
 }
 
 /*
@@ -5268,7 +5242,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
                return 0;
 
        /* this is a per-package concept */
-       if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
+       if (!is_cpu_first_thread_in_package(t, c, p))
                return 0;
 
        cpu = t->cpu_id;
@@ -5284,7 +5258,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
        }
 
        /* Temperature Target MSR is Nehalem and newer only */
-       if (!do_nhm_platform_info)
+       if (!platform->has_nhm_msrs)
                goto guess;
 
        if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
@@ -5293,34 +5267,134 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk
        tcc_default = (msr >> 16) & 0xFF;
 
        if (!quiet) {
-               switch (tcc_offset_bits) {
-               case 4:
-                       tcc_offset = (msr >> 24) & 0xF;
-                       fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
-                               cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
-                       break;
-               case 6:
-                       tcc_offset = (msr >> 24) & 0x3F;
+               int bits = platform->tcc_offset_bits;
+               unsigned long long enabled = 0;
+
+               if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
+                       enabled = (enabled >> 30) & 1;
+
+               if (bits && enabled) {
+                       tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);
                        fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
                                cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
-                       break;
-               default:
+               } else {
                        fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
-                       break;
                }
        }
 
-       if (!tcc_default)
-               goto guess;
+       if (!tcc_default)
+               goto guess;
+
+       tj_max = tcc_default;
+
+       return 0;
+
+guess:
+       tj_max = TJMAX_DEFAULT;
+       fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
+
+       return 0;
+}
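
A standalone sketch of the GENMASK()-based TCC-offset decode that replaces the old 4-bit/6-bit switch, with an assumed MSR_IA32_TEMPERATURE_TARGET value and 6 offset bits:

#include <stdio.h>

#define GENMASK(h, l) (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
        unsigned long long msr = 0x5640a00ULL;  /* illustrative raw value */
        int bits = 6;                           /* platform->tcc_offset_bits */
        int tcc_default = (msr >> 16) & 0xFF;   /* 0x64 = 100 C */
        int tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);

        printf("TjMax %d C (default %d - offset %d)\n",
               tcc_default - tcc_offset, tcc_default, tcc_offset);
        return 0;
}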
+
+int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       unsigned long long msr;
+       unsigned int dts, dts2;
+       int cpu;
+
+       UNUSED(c);
+       UNUSED(p);
+
+       if (!(do_dts || do_ptm))
+               return 0;
+
+       cpu = t->cpu_id;
+
+       /* DTS is per-core, no need to print for each thread */
+       if (!is_cpu_first_thread_in_core(t, c, p))
+               return 0;
+
+       if (cpu_migrate(cpu)) {
+               fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
+               return -1;
+       }
+
+       if (do_ptm && is_cpu_first_core_in_package(t, c, p)) {
+               if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
+                       return 0;
+
+               dts = (msr >> 16) & 0x7F;
+               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);
+
+               if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
+                       return 0;
+
+               dts = (msr >> 16) & 0x7F;
+               dts2 = (msr >> 8) & 0x7F;
+               fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
+                       cpu, msr, tj_max - dts, tj_max - dts2);
+       }
+
+       if (do_dts && debug) {
+               unsigned int resolution;
+
+               if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
+                       return 0;
+
+               dts = (msr >> 16) & 0x7F;
+               resolution = (msr >> 27) & 0xF;
+               fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
+                       cpu, msr, tj_max - dts, resolution);
+
+               if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
+                       return 0;
+
+               dts = (msr >> 16) & 0x7F;
+               dts2 = (msr >> 8) & 0x7F;
+               fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
+                       cpu, msr, tj_max - dts, tj_max - dts2);
+       }
+
+       return 0;
+}
+
+void probe_thermal(void)
+{
+       if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
+               BIC_PRESENT(BIC_CORE_THROT_CNT);
+       else
+               BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
+
+       for_all_cpus(set_temperature_target, ODD_COUNTERS);
+
+       if (quiet)
+               return;
+
+       for_all_cpus(print_thermal, ODD_COUNTERS);
+}
+
+int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
+       unsigned int eax, ebx, ecx, edx;
+
+       UNUSED(c);
+       UNUSED(p);
 
-       tj_max = tcc_default;
+       if (!genuine_intel)
+               return 0;
 
-       return 0;
+       if (cpu_migrate(t->cpu_id)) {
+               fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
+               return -1;
+       }
 
-guess:
-       tj_max = TJMAX_DEFAULT;
-       fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);
+       if (max_level < 0x1a)
+               return 0;
 
+       __cpuid(0x1a, eax, ebx, ecx, edx);
+       eax = (eax >> 24) & 0xFF;
+       if (eax == 0x20)
+               t->is_atom = true;
        return 0;
 }
 
@@ -5354,7 +5428,7 @@ void decode_misc_feature_control(void)
 {
        unsigned long long msr;
 
-       if (!has_misc_feature_control)
+       if (!platform->has_msr_misc_feature_control)
                return;
 
        if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
@@ -5375,10 +5449,7 @@ void decode_misc_pwr_mgmt_msr(void)
 {
        unsigned long long msr;
 
-       if (!do_nhm_platform_info)
-               return;
-
-       if (no_MSR_MISC_PWR_MGMT)
+       if (!platform->has_msr_misc_pwr_mgmt)
                return;
 
        if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
@@ -5397,6 +5468,9 @@ void decode_c6_demotion_policy_msr(void)
 {
        unsigned long long msr;
 
+       if (!platform->has_msr_c6_demotion_policy_config)
+               return;
+
        if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
                fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
                        base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
@@ -5406,67 +5480,6 @@ void decode_c6_demotion_policy_msr(void)
                        base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
 }
 
-/*
- * When models are the same, for the purpose of turbostat, reuse
- */
-unsigned int intel_model_duplicates(unsigned int model)
-{
-
-       switch (model) {
-       case INTEL_FAM6_NEHALEM_EP:     /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */
-       case INTEL_FAM6_NEHALEM:        /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */
-       case 0x1F:              /* Core i7 and i5 Processor - Nehalem */
-       case INTEL_FAM6_WESTMERE:       /* Westmere Client - Clarkdale, Arrandale */
-       case INTEL_FAM6_WESTMERE_EP:    /* Westmere EP - Gulftown */
-               return INTEL_FAM6_NEHALEM;
-
-       case INTEL_FAM6_NEHALEM_EX:     /* Nehalem-EX Xeon - Beckton */
-       case INTEL_FAM6_WESTMERE_EX:    /* Westmere-EX Xeon - Eagleton */
-               return INTEL_FAM6_NEHALEM_EX;
-
-       case INTEL_FAM6_XEON_PHI_KNM:
-               return INTEL_FAM6_XEON_PHI_KNL;
-
-       case INTEL_FAM6_BROADWELL_X:
-       case INTEL_FAM6_BROADWELL_D:    /* BDX-DE */
-               return INTEL_FAM6_BROADWELL_X;
-
-       case INTEL_FAM6_SKYLAKE_L:
-       case INTEL_FAM6_SKYLAKE:
-       case INTEL_FAM6_KABYLAKE_L:
-       case INTEL_FAM6_KABYLAKE:
-       case INTEL_FAM6_COMETLAKE_L:
-       case INTEL_FAM6_COMETLAKE:
-               return INTEL_FAM6_SKYLAKE_L;
-
-       case INTEL_FAM6_ICELAKE_L:
-       case INTEL_FAM6_ICELAKE_NNPI:
-       case INTEL_FAM6_TIGERLAKE_L:
-       case INTEL_FAM6_TIGERLAKE:
-       case INTEL_FAM6_ROCKETLAKE:
-       case INTEL_FAM6_LAKEFIELD:
-       case INTEL_FAM6_ALDERLAKE:
-       case INTEL_FAM6_ALDERLAKE_L:
-       case INTEL_FAM6_ATOM_GRACEMONT:
-       case INTEL_FAM6_RAPTORLAKE:
-       case INTEL_FAM6_RAPTORLAKE_P:
-       case INTEL_FAM6_RAPTORLAKE_S:
-       case INTEL_FAM6_METEORLAKE:
-       case INTEL_FAM6_METEORLAKE_L:
-               return INTEL_FAM6_CANNONLAKE_L;
-
-       case INTEL_FAM6_ATOM_TREMONT_L:
-               return INTEL_FAM6_ATOM_TREMONT;
-
-       case INTEL_FAM6_ICELAKE_D:
-               return INTEL_FAM6_ICELAKE_X;
-
-       case INTEL_FAM6_EMERALDRAPIDS_X:
-               return INTEL_FAM6_SAPPHIRERAPIDS_X;
-       }
-       return model;
-}
-
 void print_dev_latency(void)
 {
        char *path = "/dev/cpu_dma_latency";
@@ -5510,6 +5523,101 @@ void linux_perf_init(void)
        BIC_PRESENT(BIC_IPC);
 }
 
+void probe_cstates(void)
+{
+       probe_cst_limit();
+
+       if (platform->supported_cstates & CC1)
+               BIC_PRESENT(BIC_CPU_c1);
+
+       if (platform->supported_cstates & CC3)
+               BIC_PRESENT(BIC_CPU_c3);
+
+       if (platform->supported_cstates & CC6)
+               BIC_PRESENT(BIC_CPU_c6);
+
+       if (platform->supported_cstates & CC7)
+               BIC_PRESENT(BIC_CPU_c7);
+
+       if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2))
+               BIC_PRESENT(BIC_Pkgpc2);
+
+       if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3))
+               BIC_PRESENT(BIC_Pkgpc3);
+
+       if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6))
+               BIC_PRESENT(BIC_Pkgpc6);
+
+       if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7))
+               BIC_PRESENT(BIC_Pkgpc7);
+
+       if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8))
+               BIC_PRESENT(BIC_Pkgpc8);
+
+       if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9))
+               BIC_PRESENT(BIC_Pkgpc9);
+
+       if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10))
+               BIC_PRESENT(BIC_Pkgpc10);
+
+       if (platform->has_msr_module_c6_res_ms)
+               BIC_PRESENT(BIC_Mod_c6);
+
+       if (platform->has_ext_cst_msrs) {
+               BIC_PRESENT(BIC_Totl_c0);
+               BIC_PRESENT(BIC_Any_c0);
+               BIC_PRESENT(BIC_GFX_c0);
+               BIC_PRESENT(BIC_CPUGFX);
+       }
+
+       if (quiet)
+               return;
+
+       dump_power_ctl();
+       dump_cst_cfg();
+       decode_c6_demotion_policy_msr();
+       print_dev_latency();
+       dump_sysfs_cstate_config();
+       print_irtl();
+}
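
A minimal sketch of the two-level gating probe_cstates() applies to the package C-state columns: the platform table must advertise the state, and the hardware limit decoded from MSR_PKG_CST_CONFIG_CONTROL must permit it. All names and values here are illustrative.

#include <stdio.h>

#define PC2 (1 << 0)
#define PC3 (1 << 1)
#define PC6 (1 << 2)

int main(void)
{
        unsigned int supported_cstates = PC2 | PC3 | PC6;       /* platform table */
        int pkg_cstate_limit = 2;                               /* decoded from the MSR */

        if (supported_cstates & PC2 && pkg_cstate_limit >= 2)
                printf("show Pkg%%pc2\n");
        if (supported_cstates & PC3 && pkg_cstate_limit >= 3)
                printf("show Pkg%%pc3\n");      /* suppressed: limit < 3 */
        if (supported_cstates & PC6 && pkg_cstate_limit >= 6)
                printf("show Pkg%%pc6\n");      /* suppressed: limit < 6 */
        return 0;
}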
+
+void probe_lpi(void)
+{
+       if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
+               BIC_PRESENT(BIC_CPU_LPI);
+       else
+               BIC_NOT_PRESENT(BIC_CPU_LPI);
+
+       if (!access(sys_lpi_file_sysfs, R_OK)) {
+               sys_lpi_file = sys_lpi_file_sysfs;
+               BIC_PRESENT(BIC_SYS_LPI);
+       } else if (!access(sys_lpi_file_debugfs, R_OK)) {
+               sys_lpi_file = sys_lpi_file_debugfs;
+               BIC_PRESENT(BIC_SYS_LPI);
+       } else {
+               sys_lpi_file_sysfs = NULL;
+               BIC_NOT_PRESENT(BIC_SYS_LPI);
+       }
+}
+
+void probe_pstates(void)
+{
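+       /* bclk is needed even in quiet mode; everything below is verbose-only */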
+       probe_bclk();
+
+       if (quiet)
+               return;
+
+       dump_platform_info();
+       dump_turbo_ratio_info();
+       dump_sysfs_pstate_config();
+       decode_misc_pwr_mgmt_msr();
+
+       for_all_cpus(print_hwp, ODD_COUNTERS);
+       for_all_cpus(print_epb, ODD_COUNTERS);
+       for_all_cpus(print_perf_limit, ODD_COUNTERS);
+}
+
 void process_cpuid()
 {
        unsigned int eax, ebx, ecx, edx;
@@ -5569,10 +5677,8 @@ void process_cpuid()
                        edx_flags & (1 << 22) ? "ACPI-TM" : "-",
                        edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
        }
-       if (genuine_intel) {
-               model_orig = model;
-               model = intel_model_duplicates(model);
-       }
+
+       probe_platform_features(family, model);
 
        if (!(edx_flags & (1 << 5)))
                errx(1, "CPUID: no MSR");
@@ -5656,26 +5762,12 @@ void process_cpuid()
                __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);
 
                if (ebx_tsc != 0) {
-
                        if (!quiet && (ebx != 0))
                                fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
                                        eax_crystal, ebx_tsc, crystal_hz);
 
                        if (crystal_hz == 0)
-                               switch (model) {
-                               case INTEL_FAM6_SKYLAKE_L:      /* SKL */
-                                       crystal_hz = 24000000;  /* 24.0 MHz */
-                                       break;
-                               case INTEL_FAM6_ATOM_GOLDMONT_D:        /* DNV */
-                                       crystal_hz = 25000000;  /* 25.0 MHz */
-                                       break;
-                               case INTEL_FAM6_ATOM_GOLDMONT:  /* BXT */
-                               case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
-                                       crystal_hz = 19200000;  /* 19.2 MHz */
-                                       break;
-                               default:
-                                       crystal_hz = 0;
-                               }
+                               crystal_hz = platform->crystal_freq;
 
                        if (crystal_hz) {
                                tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
@@ -5700,147 +5792,33 @@ void process_cpuid()
        }
 
        if (has_aperf)
-               aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model);
+               aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;
 
        BIC_PRESENT(BIC_IRQ);
        BIC_PRESENT(BIC_TSC_MHz);
+}
 
-       if (probe_nhm_msrs(family, model)) {
-               do_nhm_platform_info = 1;
-               BIC_PRESENT(BIC_CPU_c1);
-               BIC_PRESENT(BIC_CPU_c3);
-               BIC_PRESENT(BIC_CPU_c6);
-               BIC_PRESENT(BIC_SMI);
-       }
-       do_snb_cstates = has_snb_msrs(family, model);
-
-       if (do_snb_cstates)
-               BIC_PRESENT(BIC_CPU_c7);
-
-       do_irtl_snb = has_snb_msrs(family, model);
-       if (do_snb_cstates && (pkg_cstate_limit >= PCL__2))
-               BIC_PRESENT(BIC_Pkgpc2);
-       if (pkg_cstate_limit >= PCL__3)
-               BIC_PRESENT(BIC_Pkgpc3);
-       if (pkg_cstate_limit >= PCL__6)
-               BIC_PRESENT(BIC_Pkgpc6);
-       if (do_snb_cstates && (pkg_cstate_limit >= PCL__7))
-               BIC_PRESENT(BIC_Pkgpc7);
-       if (has_slv_msrs(family, model)) {
-               BIC_NOT_PRESENT(BIC_Pkgpc2);
-               BIC_NOT_PRESENT(BIC_Pkgpc3);
-               BIC_PRESENT(BIC_Pkgpc6);
-               BIC_NOT_PRESENT(BIC_Pkgpc7);
-               BIC_PRESENT(BIC_Mod_c6);
-               use_c1_residency_msr = 1;
-       }
-       if (is_jvl(family, model)) {
-               BIC_NOT_PRESENT(BIC_CPU_c3);
-               BIC_NOT_PRESENT(BIC_CPU_c7);
-               BIC_NOT_PRESENT(BIC_Pkgpc2);
-               BIC_NOT_PRESENT(BIC_Pkgpc3);
-               BIC_NOT_PRESENT(BIC_Pkgpc6);
-               BIC_NOT_PRESENT(BIC_Pkgpc7);
-       }
-       if (is_dnv(family, model)) {
-               BIC_PRESENT(BIC_CPU_c1);
-               BIC_NOT_PRESENT(BIC_CPU_c3);
-               BIC_NOT_PRESENT(BIC_Pkgpc3);
-               BIC_NOT_PRESENT(BIC_CPU_c7);
-               BIC_NOT_PRESENT(BIC_Pkgpc7);
-               use_c1_residency_msr = 1;
-       }
-       if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) {
-               BIC_NOT_PRESENT(BIC_CPU_c3);
-               BIC_NOT_PRESENT(BIC_Pkgpc3);
-               BIC_NOT_PRESENT(BIC_CPU_c7);
-               BIC_NOT_PRESENT(BIC_Pkgpc7);
-       }
-       if (is_bdx(family, model)) {
-               BIC_NOT_PRESENT(BIC_CPU_c7);
-               BIC_NOT_PRESENT(BIC_Pkgpc7);
-       }
-       if (has_c8910_msrs(family, model)) {
-               if (pkg_cstate_limit >= PCL__8)
-                       BIC_PRESENT(BIC_Pkgpc8);
-               if (pkg_cstate_limit >= PCL__9)
-                       BIC_PRESENT(BIC_Pkgpc9);
-               if (pkg_cstate_limit >= PCL_10)
-                       BIC_PRESENT(BIC_Pkgpc10);
-       }
-       do_irtl_hsw = has_c8910_msrs(family, model);
-       if (has_skl_msrs(family, model)) {
-               BIC_PRESENT(BIC_Totl_c0);
-               BIC_PRESENT(BIC_Any_c0);
-               BIC_PRESENT(BIC_GFX_c0);
-               BIC_PRESENT(BIC_CPUGFX);
-       }
-       do_slm_cstates = is_slm(family, model);
-       do_knl_cstates = is_knl(family, model);
-
-       if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model))
-               BIC_NOT_PRESENT(BIC_CPU_c3);
-
-       if (!quiet)
-               decode_misc_pwr_mgmt_msr();
-
-       if (!quiet && has_slv_msrs(family, model))
-               decode_c6_demotion_policy_msr();
-
-       rapl_probe(family, model);
-       perf_limit_reasons_probe(family, model);
-       automatic_cstate_conversion_probe(family, model);
-
-       check_tcc_offset(model_orig);
-
-       if (!quiet)
-               dump_cstate_pstate_config_info(family, model);
-       intel_uncore_frequency_probe();
-
-       if (!quiet)
-               print_dev_latency();
-       if (!quiet)
-               dump_sysfs_cstate_config();
-       if (!quiet)
-               dump_sysfs_pstate_config();
+void probe_pm_features(void)
+{
+       probe_pstates();
 
-       if (has_skl_msrs(family, model) || is_ehl(family, model))
-               calculate_tsc_tweak();
+       probe_cstates();
 
-       if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
-               BIC_PRESENT(BIC_GFX_rc6);
+       probe_lpi();
 
-       if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
-               BIC_PRESENT(BIC_GFXMHz);
+       probe_intel_uncore_frequency();
 
-       if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
-               BIC_PRESENT(BIC_GFXACTMHz);
+       probe_graphics();
 
-       if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
-               BIC_PRESENT(BIC_CPU_LPI);
-       else
-               BIC_NOT_PRESENT(BIC_CPU_LPI);
+       probe_rapl();
 
-       if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
-               BIC_PRESENT(BIC_CORE_THROT_CNT);
-       else
-               BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
+       probe_thermal();
 
-       if (!access(sys_lpi_file_sysfs, R_OK)) {
-               sys_lpi_file = sys_lpi_file_sysfs;
-               BIC_PRESENT(BIC_SYS_LPI);
-       } else if (!access(sys_lpi_file_debugfs, R_OK)) {
-               sys_lpi_file = sys_lpi_file_debugfs;
-               BIC_PRESENT(BIC_SYS_LPI);
-       } else {
-               sys_lpi_file_sysfs = NULL;
-               BIC_NOT_PRESENT(BIC_SYS_LPI);
-       }
+       if (platform->has_nhm_msrs)
+               BIC_PRESENT(BIC_SMI);
 
        if (!quiet)
                decode_misc_feature_control();
-
-       return;
 }
 
 /*
@@ -5855,7 +5833,7 @@ int dir_filter(const struct dirent *dirp)
                return 0;
 }
 
-void topology_probe()
+void topology_probe(bool startup)
 {
        int i;
        int max_core_id = 0;
@@ -5888,14 +5866,62 @@ void topology_probe()
        for_all_proc_cpus(mark_cpu_present);
 
        /*
-        * Validate that all cpus in cpu_subset are also in cpu_present_set
+        * Allocate and initialize cpu_effective_set
+        */
+       cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1));
+       if (cpu_effective_set == NULL)
+               err(3, "CPU_ALLOC");
+       cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
+       CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set);
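+       /* the effective set: cpus this process may use under its current cgroup */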
+       update_effective_set(startup);
+
+       /*
+        * Allocate and initialize cpu_allowed_set
+        */
+       cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1));
+       if (cpu_allowed_set == NULL)
+               err(3, "CPU_ALLOC");
+       cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
+       CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);
+
+       /*
+        * Validate and update cpu_allowed_set.
+        *
+        * Make sure all cpus in cpu_subset are also in cpu_present_set during startup.
+        * Give a warning when cpus in cpu_subset become unavailable at runtime.
+        * Give a warning when cpus are not effective because of cgroup settings.
+        *
+        * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset.
         */
        for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
-               if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
-                       if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set))
-                               err(1, "cpu%d not present", i);
+               if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
+                       continue;
+
+               if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) {
+                       if (cpu_subset) {
+                               /* cpus in cpu_subset must be in cpu_present_set during startup */
+                               if (startup)
+                                       err(1, "cpu%d not present", i);
+                               else
+                                       fprintf(stderr, "cpu%d not present\n", i);
+                       }
+                       continue;
+               }
+
+               if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) {
+                       if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) {
+                               fprintf(stderr, "cpu%d not effective\n", i);
+                               continue;
+                       }
+               }
+
+               CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set);
        }
 
+       if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set))
+               err(-ENODEV, "No valid cpus found");
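+       /* from here on, run only on the allowed cpus */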
+       sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set);
+
        /*
         * Allocate and initialize cpu_affinity_set
         */
@@ -6009,15 +6035,19 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_
        if (*c == NULL)
                goto error;
 
-       for (i = 0; i < num_cores; i++)
+       for (i = 0; i < num_cores; i++) {
                (*c)[i].core_id = -1;
+               (*c)[i].base_cpu = -1;
+       }
 
        *p = calloc(topo.num_packages, sizeof(struct pkg_data));
        if (*p == NULL)
                goto error;
 
-       for (i = 0; i < topo.num_packages; i++)
+       for (i = 0; i < topo.num_packages; i++) {
                (*p)[i].package_id = i;
+               (*p)[i].base_cpu = -1;
+       }
 
        return;
 error:
@@ -6050,10 +6080,11 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
        p = GET_PKG(pkg_base, pkg_id);
 
        t->cpu_id = cpu_id;
-       if (thread_id == 0) {
-               t->flags |= CPU_IS_FIRST_THREAD_IN_CORE;
-               if (cpu_is_first_core_in_package(cpu_id))
-                       t->flags |= CPU_IS_FIRST_CORE_IN_PACKAGE;
+       if (!cpu_is_not_allowed(cpu_id)) {
+               if (c->base_cpu < 0)
+                       c->base_cpu = t->cpu_id;
+               if (p->base_cpu < 0)
+                       p->base_cpu = t->cpu_id;
        }
 
        c->core_id = core_id;
@@ -6093,59 +6124,64 @@ void allocate_irq_buffers(void)
                err(-1, "calloc %d", topo.max_cpu_num + 1);
 }
 
-void setup_all_buffers(void)
+int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p)
+{
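+       /* count each core/package once, via its base (first allowed) cpu */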
+       topo.allowed_cpus++;
+       if ((int)t->cpu_id == c->base_cpu)
+               topo.allowed_cores++;
+       if ((int)t->cpu_id == p->base_cpu)
+               topo.allowed_packages++;
+
+       return 0;
+}
+
+void topology_update(void)
+{
+       topo.allowed_cpus = 0;
+       topo.allowed_cores = 0;
+       topo.allowed_packages = 0;
+       for_all_cpus(update_topo, ODD_COUNTERS);
+}
+
+void setup_all_buffers(bool startup)
 {
-       topology_probe();
+       topology_probe(startup);
        allocate_irq_buffers();
        allocate_fd_percpu();
        allocate_counters(&thread_even, &core_even, &package_even);
        allocate_counters(&thread_odd, &core_odd, &package_odd);
        allocate_output_buffer();
        for_all_proc_cpus(initialize_counters);
+       topology_update();
 }
 
 void set_base_cpu(void)
 {
-       base_cpu = sched_getcpu();
-       if (base_cpu < 0)
-               err(-ENODEV, "No valid cpus found");
+       int i;
 
-       if (debug > 1)
-               fprintf(outf, "base_cpu = %d\n", base_cpu);
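+       /* base_cpu becomes the lowest-numbered cpu in cpu_allowed_set */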
+       for (i = 0; i < topo.max_cpu_num + 1; ++i) {
+               if (cpu_is_not_allowed(i))
+                       continue;
+               base_cpu = i;
+               if (debug > 1)
+                       fprintf(outf, "base_cpu = %d\n", base_cpu);
+               return;
+       }
+       err(-ENODEV, "No valid cpus found");
 }
 
 void turbostat_init()
 {
-       setup_all_buffers();
+       setup_all_buffers(true);
        set_base_cpu();
        check_dev_msr();
        check_permissions();
        process_cpuid();
+       probe_pm_features();
        linux_perf_init();
 
-       if (!quiet)
-               for_all_cpus(print_hwp, ODD_COUNTERS);
-
-       if (!quiet)
-               for_all_cpus(print_epb, ODD_COUNTERS);
-
-       if (!quiet)
-               for_all_cpus(print_perf_limit, ODD_COUNTERS);
-
-       if (!quiet)
-               for_all_cpus(print_rapl, ODD_COUNTERS);
-
-       for_all_cpus(set_temperature_target, ODD_COUNTERS);
-
        for_all_cpus(get_cpu_type, ODD_COUNTERS);
        for_all_cpus(get_cpu_type, EVEN_COUNTERS);
 
-       if (!quiet)
-               for_all_cpus(print_thermal, ODD_COUNTERS);
-
-       if (!quiet && do_irtl_snb)
-               print_irtl();
-
        if (DO_BIC(BIC_IPC))
                (void)get_instr_count_fd(base_cpu);
 }
@@ -6160,8 +6196,6 @@ int fork_it(char **argv)
        first_counter_read = 0;
        if (status)
                exit(status);
-       /* clear affinity side-effect of get_counters() */
-       sched_setaffinity(0, cpu_present_setsize, cpu_present_set);
        gettimeofday(&tv_even, (struct timezone *)NULL);
 
        child_pid = fork();
@@ -6225,7 +6259,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-       fprintf(outf, "turbostat version 2023.03.17 - Len Brown <lenb@kernel.org>\n");
+       fprintf(outf, "turbostat version 2023.11.07 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
@@ -6508,9 +6542,6 @@ void probe_sysfs(void)
  */
 void parse_cpu_command(char *optarg)
 {
-       unsigned int start, end;
-       char *next;
-
        if (!strcmp(optarg, "core")) {
                if (cpu_subset)
                        goto error;
@@ -6533,52 +6564,8 @@ void parse_cpu_command(char *optarg)
 
        CPU_ZERO_S(cpu_subset_size, cpu_subset);
 
-       next = optarg;
-
-       while (next && *next) {
-
-               if (*next == '-')       /* no negative cpu numbers */
-                       goto error;
-
-               start = strtoul(next, &next, 10);
-
-               if (start >= CPU_SUBSET_MAXCPUS)
-                       goto error;
-               CPU_SET_S(start, cpu_subset_size, cpu_subset);
-
-               if (*next == '\0')
-                       break;
-
-               if (*next == ',') {
-                       next += 1;
-                       continue;
-               }
-
-               if (*next == '-') {
-                       next += 1;      /* start range */
-               } else if (*next == '.') {
-                       next += 1;
-                       if (*next == '.')
-                               next += 1;      /* start range */
-                       else
-                               goto error;
-               }
-
-               end = strtoul(next, &next, 10);
-               if (end <= start)
-                       goto error;
-
-               while (++start <= end) {
-                       if (start >= CPU_SUBSET_MAXCPUS)
-                               goto error;
-                       CPU_SET_S(start, cpu_subset_size, cpu_subset);
-               }
-
-               if (*next == ',')
-                       next += 1;
-               else if (*next != '\0')
-                       goto error;
-       }
+       if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size))
+               goto error;
 
        return;
 
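For reference, the removed loop encoded the cpu-list grammar that --cpu accepts: comma-separated entries, each a single cpu or a "first-last" (also "first..last") range, with negative numbers and reversed ranges rejected. Below is a standalone sketch of that grammar; the factored-out helper is the new parse_cpu_str(), whose definition is outside this hunk, so every name and the cpu cap here are illustrative only:

#include <stdio.h>
#include <stdlib.h>

#define MAXCPUS 1024	/* stand-in for CPU_SUBSET_MAXCPUS; real value not shown here */

/* accepts "N", "N-M", "N..M", comma-separated; prints the cpus it would set */
static int parse_cpu_list(const char *p)
{
	while (*p) {
		char *end;
		unsigned long start, stop;

		if (*p == '-')	/* no negative cpu numbers */
			return -1;
		start = strtoul(p, &end, 10);
		if (end == p)
			return -1;
		stop = start;
		if (*end == '-' || (end[0] == '.' && end[1] == '.')) {
			p = end + (*end == '-' ? 1 : 2);
			stop = strtoul(p, &end, 10);
			if (end == p || stop <= start)
				return -1;
		}
		if (stop >= MAXCPUS)
			return -1;
		for (; start <= stop; start++)
			printf("cpu%lu\n", start);	/* real code: CPU_SET_S() */
		if (*end == ',')
			end++;
		else if (*end != '\0')
			return -1;
		p = end;
	}
	return 0;
}

int main(void)
{
	return parse_cpu_list("1,2,8-15") ? 1 : 0;
}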
@@ -6719,6 +6706,19 @@ void cmdline(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
+       int fd, ret;
+
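+       /* Writing "0" moves this process into the root cgroup (best effort),
+        * so cgroup cpu restrictions do not hide cpus from the counters.
+        */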
+       fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
+       if (fd < 0)
+               goto skip_cgroup_setting;
+
+       ret = write(fd, "0\n", 2);
+       if (ret == -1)
+               perror("Can't update cgroup");
+
+       close(fd);
+
+skip_cgroup_setting:
        outf = stderr;
        cmdline(argc, argv);
 
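Aside on the turbostat hunks above: the removed per-model switch statements are replaced by one platform descriptor consulted through platform->. Below is a minimal self-contained sketch of that pattern; the struct layout, bit values, and the SKL entry are assumptions inferred from these hunks, not the patch's actual definitions:

#include <stdio.h>

/* C-state support bits, mirroring the CCn/PCn names used in probe_cstates() */
#define CC1	(1 << 0)
#define CC6	(1 << 1)
#define PC2	(1 << 2)
#define PC6	(1 << 3)

/* hypothetical descriptor; field names match the platform-> uses above */
struct platform_features {
	unsigned int supported_cstates;	/* bitmask of the CCn/PCn bits */
	unsigned int crystal_freq;	/* crystal clock in Hz, 0 = ask CPUID.15H */
	int need_perf_multiplier;	/* APERF/MPERF scaled by 1024 when set */
	int has_nhm_msrs;		/* SMI counter MSR present when set */
};

static const struct platform_features skl_features = {
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.crystal_freq = 24000000,	/* 24.0 MHz, per the removed switch */
};

int main(void)
{
	const struct platform_features *platform = &skl_features;

	if (platform->supported_cstates & CC6)
		printf("would enable the CPU%%c6 column\n");
	printf("crystal_hz = %u\n", platform->crystal_freq);
	return 0;
}

Feature checks thus collapse into the bitmask tests seen in probe_cstates(), with no family/model switches left in the callers.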
index df7697b..c1f55e1 100644 (file)
@@ -97,4 +97,66 @@ l0_%=:       r2 = r0;                                        \
 "      ::: __clobber_all);
 }
 
+SEC("socket")
+__description("conditional loop (2)")
+__success
+__failure_unpriv __msg_unpriv("back-edge from insn 10 to 11")
+__naked void conditional_loop2(void)
+{
+       asm volatile ("                                 \
+       r9 = 2 ll;                                      \
+       r3 = 0x20 ll;                                   \
+       r4 = 0x35 ll;                                   \
+       r8 = r4;                                        \
+       goto l1_%=;                                     \
+l0_%=: r9 -= r3;                                       \
+       r9 -= r4;                                       \
+       r9 -= r8;                                       \
+l1_%=: r8 += r4;                                       \
+       if r8 < 0x64 goto l0_%=;                        \
+       r0 = r9;                                        \
+       exit;                                           \
+"      ::: __clobber_all);
+}
+
+SEC("socket")
+__description("unconditional loop after conditional jump")
+__failure __msg("infinite loop detected")
+__failure_unpriv __msg_unpriv("back-edge from insn 3 to 2")
+__naked void uncond_loop_after_cond_jmp(void)
+{
+       asm volatile ("                                 \
+       r0 = 0;                                         \
+       if r0 > 0 goto l1_%=;                           \
+l0_%=: r0 = 1;                                         \
+       goto l0_%=;                                     \
+l1_%=: exit;                                           \
+"      ::: __clobber_all);
+}
+
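+/* "goto -1" branches back to itself, so this subprog never terminates */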
+__naked __noinline __used
+static unsigned long never_ending_subprog()
+{
+       asm volatile ("                                 \
+       r0 = r1;                                        \
+       goto -1;                                        \
+"      ::: __clobber_all);
+}
+
+SEC("socket")
+__description("unconditional loop after conditional jump")
+/* infinite loop is detected *after* check_cfg() */
+__failure __msg("infinite loop detected")
+__naked void uncond_loop_in_subprog_after_cond_jmp(void)
+{
+       asm volatile ("                                 \
+       r0 = 0;                                         \
+       if r0 > 0 goto l1_%=;                           \
+l0_%=: r0 += 1;                                        \
+       call never_ending_subprog;                      \
+l1_%=: exit;                                           \
+"      ::: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
index 5bc86af..71735db 100644 (file)
@@ -75,9 +75,10 @@ l0_%=:       r0 += 1;                                        \
 "      ::: __clobber_all);
 }
 
-SEC("tracepoint")
+SEC("socket")
 __description("bounded loop, start in the middle")
-__failure __msg("back-edge")
+__success
+__failure_unpriv __msg_unpriv("back-edge")
 __naked void loop_start_in_the_middle(void)
 {
        asm volatile ("                                 \
@@ -136,7 +137,9 @@ l0_%=:      exit;                                           \
 
 SEC("tracepoint")
 __description("bounded recursion")
-__failure __msg("back-edge")
+__failure
+/* verifier limitation in detecting max stack depth */
+__msg("the call stack of 8 frames is too deep !")
 __naked void bounded_recursion(void)
 {
        asm volatile ("                                 \
index 193c0f8..6b564d4 100644 (file)
@@ -91,3 +91,43 @@ __naked int bpf_end_bswap(void)
 }
 
 #endif /* v4 instruction */
+
+SEC("?raw_tp")
+__success __log_level(2)
+/*
+ * Without the bug fix there will be no history between "last_idx 3 first_idx 3"
+ * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor
+ * expected log messages to the one specific mark_chain_precision operation.
+ *
+ * This is quite fragile: if the verifier checkpointing heuristic changes, this
+ * might need adjusting.
+ */
+__msg("2: (07) r0 += 1                       ; R0_w=6")
+__msg("3: (35) if r0 >= 0xa goto pc+1")
+__msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1")
+__msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1")
+__msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4")
+__msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1")
+__msg("mark_precise: frame0: parent state regs= stack=:  R0_rw=P4")
+__msg("3: R0_w=6")
+__naked int state_loop_first_last_equal(void)
+{
+       asm volatile (
+               "r0 = 0;"
+       "l0_%=:"
+               "r0 += 1;"
+               "r0 += 1;"
+               /* every few iterations we'll have a checkpoint here with
+                * first_idx == last_idx, potentially confusing precision
+                * backtracking logic
+                */
+               "if r0 >= 10 goto l1_%=;"       /* checkpoint + mark_precise */
+               "goto l0_%=;"
+       "l1_%=:"
+               "exit;"
+               ::: __clobber_common
+       );
+}
+
+char _license[] SEC("license") = "GPL";
index 1bdf2b4..3d5cd51 100644 (file)
        BPF_EXIT_INSN(),
        },
        .prog_type = BPF_PROG_TYPE_TRACEPOINT,
-       .errstr = "back-edge from insn 0 to 0",
+       .errstr = "the call stack of 9 frames is too deep",
        .result = REJECT,
 },
 {
        BPF_EXIT_INSN(),
        },
        .prog_type = BPF_PROG_TYPE_TRACEPOINT,
-       .errstr = "back-edge",
+       .errstr = "the call stack of 9 frames is too deep",
        .result = REJECT,
 },
 {
        BPF_EXIT_INSN(),
        },
        .prog_type = BPF_PROG_TYPE_TRACEPOINT,
-       .errstr = "back-edge",
+       .errstr = "the call stack of 9 frames is too deep",
        .result = REJECT,
 },
 {
index f929790..78f19c2 100644 (file)
@@ -9,8 +9,8 @@
        BPF_MOV64_IMM(BPF_REG_0, 2),
        BPF_EXIT_INSN(),
        },
-       .errstr = "invalid BPF_LD_IMM insn",
-       .errstr_unpriv = "R1 pointer comparison",
+       .errstr = "jump into the middle of ldimm64 insn 1",
+       .errstr_unpriv = "jump into the middle of ldimm64 insn 1",
        .result = REJECT,
 },
 {
@@ -23,8 +23,8 @@
        BPF_LD_IMM64(BPF_REG_0, 1),
        BPF_EXIT_INSN(),
        },
-       .errstr = "invalid BPF_LD_IMM insn",
-       .errstr_unpriv = "R1 pointer comparison",
+       .errstr = "jump into the middle of ldimm64 insn 1",
+       .errstr_unpriv = "jump into the middle of ldimm64 insn 1",
        .result = REJECT,
 },
 {
index 591ca96..b604c57 100644 (file)
@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr)
        struct xdp_info *meta = data - sizeof(struct xdp_info);
 
        if (meta->count != pkt->pkt_nb) {
-               ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n",
-                              __func__, pkt->pkt_nb, meta->count);
+               ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n",
+                              __func__, pkt->pkt_nb,
+                              (unsigned long long)meta->count);
                return false;
        }
 
@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp
 
        if (addr >= umem->num_frames * umem->frame_size ||
            addr + len > umem->num_frames * umem->frame_size) {
-               ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len);
+               ksft_print_msg("Frag invalid addr: %llx len: %u\n",
+                              (unsigned long long)addr, len);
                return false;
        }
        if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) {
-               ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len);
+               ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n",
+                              (unsigned long long)addr, len);
                return false;
        }
 
@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size)
                        u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1);
 
                        ksft_print_msg("[%s] Too many packets completed\n", __func__);
-                       ksft_print_msg("Last completion address: %llx\n", addr);
+                       ksft_print_msg("Last completion address: %llx\n",
+                                      (unsigned long long)addr);
                        return TEST_FAILURE;
                }
 
@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject)
        }
 
        if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) {
-               ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n",
-                              __func__, stats.tx_invalid_descs,
+               ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n",
+                              __func__,
+                              (unsigned long long)stats.tx_invalid_descs,
                               ifobject->xsk->pkt_stream->nb_pkts);
                return TEST_FAILURE;
        }
index cc920c7..4ff10ea 100644 (file)
@@ -45,3 +45,4 @@ mdwe_test
 gup_longterm
 mkdirty
 va_high_addr_switch
+hugetlb_fault_after_madv
index 0161fb4..befab43 100644 (file)
@@ -94,19 +94,19 @@ int init_uffd(void)
 
        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
        if (uffd == -1)
-               ksft_exit_fail_msg("uffd syscall failed\n");
+               return uffd;
 
        uffdio_api.api = UFFD_API;
        uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC |
                              UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        if (ioctl(uffd, UFFDIO_API, &uffdio_api))
-               ksft_exit_fail_msg("UFFDIO_API\n");
+               return -1;
 
        if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) ||
            !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) ||
            !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) ||
            !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM))
-               ksft_exit_fail_msg("UFFDIO_API error %llu\n", uffdio_api.api);
+               return -1;
 
        return 0;
 }
@@ -1151,7 +1151,7 @@ int sanity_tests(void)
        /* 9. Memory mapped file */
        fd = open(__FILE__, O_RDONLY);
        if (fd < 0)
-               ksft_exit_fail_msg("%s Memory mapped file\n");
+               ksft_exit_fail_msg("%s Memory mapped file\n", __func__);
 
        ret = stat(__FILE__, &sbuf);
        if (ret < 0)
@@ -1159,7 +1159,7 @@ int sanity_tests(void)
 
        fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (fmem == MAP_FAILED)
-               ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno));
+               ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
 
        tmp_buf = malloc(sbuf.st_size);
        memcpy(tmp_buf, fmem, sbuf.st_size);
@@ -1189,7 +1189,7 @@ int sanity_tests(void)
 
        fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (fmem == MAP_FAILED)
-               ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno));
+               ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
 
        wp_init(fmem, buf_size);
        wp_addr_range(fmem, buf_size);
@@ -1479,6 +1479,10 @@ int main(void)
        struct stat sbuf;
 
        ksft_print_header();
+
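+       /* init_uffd() now reports failure instead of exiting, so the test
+        * can be skipped (and still pass) when userfaultfd is unusable
+        */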
+       if (init_uffd())
+               return ksft_exit_pass();
+
        ksft_set_plan(115);
 
        page_size = getpagesize();
@@ -1488,9 +1492,6 @@ int main(void)
        if (pagemap_fd < 0)
                return -EINVAL;
 
-       if (init_uffd())
-               ksft_exit_fail_msg("uffd init failed\n");
-
        /* 1. Sanity testing */
        sanity_tests_sd();
 
@@ -1595,7 +1596,7 @@ int main(void)
 
        fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (fmem == MAP_FAILED)
-               ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno));
+               ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
 
        wp_init(fmem, sbuf.st_size);
        wp_addr_range(fmem, sbuf.st_size);
@@ -1623,7 +1624,7 @@ int main(void)
 
        fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (fmem == MAP_FAILED)
-               ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno));
+               ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
 
        wp_init(fmem, buf_size);
        wp_addr_range(fmem, buf_size);
index cc16f6c..0075744 100755 (executable)
@@ -223,9 +223,12 @@ CATEGORY="hugetlb" run_test ./hugepage-mremap
 CATEGORY="hugetlb" run_test ./hugepage-vmemmap
 CATEGORY="hugetlb" run_test ./hugetlb-madvise
 
+nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages)
# For this test, we need exactly one huge page
 echo 1 > /proc/sys/vm/nr_hugepages
 CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+# Restore the previous number of huge pages, since further tests rely on it
+echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages
 
 if test_selected "hugetlb"; then
        echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
index 75a2438..3c94f2f 100755 (executable)
@@ -3240,7 +3240,7 @@ fastclose_tests()
        if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then
                test_linkfail=1024 fastclose=server \
                        run_tests $ns1 $ns2 10.0.1.1
-               chk_join_nr 0 0 0
+               chk_join_nr 0 0 0 0 0 0 1
                chk_fclose_nr 1 1 invert
                chk_rst_nr 1 1
        fi