Merge branch 'thermal-hfi'
authorRafael J. Wysocki <rafael.j.wysocki@intel.com>
Fri, 18 Mar 2022 18:00:26 +0000 (19:00 +0100)
committerRafael J. Wysocki <rafael.j.wysocki@intel.com>
Fri, 18 Mar 2022 18:00:26 +0000 (19:00 +0100)
Merge Intel Hardware Feedback Interface (HFI) thermal driver for
5.18-rc1 and update the intel-speed-select utility to support that
driver.

* thermal-hfi:
  tools/power/x86/intel-speed-select: v1.12 release
  tools/power/x86/intel-speed-select: HFI support
  tools/power/x86/intel-speed-select: OOB daemon mode
  thermal: intel: hfi: INTEL_HFI_THERMAL depends on NET
  thermal: netlink: Fix parameter type of thermal_genl_cpu_capability_event() stub
  thermal: intel: hfi: Notify user space for HFI events
  thermal: netlink: Add a new event to notify CPU capabilities change
  thermal: intel: hfi: Enable notification interrupt
  thermal: intel: hfi: Handle CPU hotplug events
  thermal: intel: hfi: Minimally initialize the Hardware Feedback Interface
  x86/cpu: Add definitions for the Intel Hardware Feedback Interface
  x86/Documentation: Describe the Intel Hardware Feedback Interface

18 files changed:
Documentation/x86/index.rst
Documentation/x86/intel-hfi.rst [new file with mode: 0644]
arch/x86/include/asm/cpufeatures.h
arch/x86/include/asm/msr-index.h
drivers/thermal/intel/Kconfig
drivers/thermal/intel/Makefile
drivers/thermal/intel/intel_hfi.c [new file with mode: 0644]
drivers/thermal/intel/intel_hfi.h [new file with mode: 0644]
drivers/thermal/intel/therm_throt.c
drivers/thermal/thermal_netlink.c
drivers/thermal/thermal_netlink.h
include/uapi/linux/thermal.h
tools/power/x86/intel-speed-select/Build
tools/power/x86/intel-speed-select/Makefile
tools/power/x86/intel-speed-select/hfi-events.c [new file with mode: 0644]
tools/power/x86/intel-speed-select/isst-config.c
tools/power/x86/intel-speed-select/isst-daemon.c [new file with mode: 0644]
tools/power/x86/intel-speed-select/isst.h

index f498f1d..982c8af 100644 (file)
@@ -21,6 +21,7 @@ x86-specific Documentation
    tlb
    mtrr
    pat
+   intel-hfi
    intel-iommu
    intel_txt
    amd-memory-encryption
diff --git a/Documentation/x86/intel-hfi.rst b/Documentation/x86/intel-hfi.rst
new file mode 100644 (file)
index 0000000..49dea58
--- /dev/null
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================================
+Hardware-Feedback Interface for scheduling on Intel Hardware
+============================================================
+
+Overview
+--------
+
+Intel has described the Hardware Feedback Interface (HFI) in the Intel 64 and
+IA-32 Architectures Software Developer's Manual (Intel SDM) Volume 3 Section
+14.6 [1]_.
+
+The HFI gives the operating system performance and energy efficiency
+capability data for each CPU in the system. Linux can use the information from
+the HFI to influence task placement decisions.
+
+The Hardware Feedback Interface
+-------------------------------
+
+The Hardware Feedback Interface provides to the operating system information
+about the performance and energy efficiency of each CPU in the system. Each
+capability is given as a unit-less quantity in the range [0-255]. Higher values
+indicate higher capability. Energy efficiency and performance are reported in
+separate capabilities. Even though on some systems these two metrics may be
+related, they are specified as independent capabilities in the Intel SDM.
+
+These capabilities may change at runtime as a result of changes in the
+operating conditions of the system or the action of external factors. The rate
+at which these capabilities are updated is specific to each processor model. On
+some models, capabilities are set at boot time and never change. On others,
+capabilities may change every tens of milliseconds. For instance, a remote
+mechanism may be used to lower Thermal Design Power. Such change can be
+reflected in the HFI. Likewise, if the system needs to be throttled due to
+excessive heat, the HFI may reflect reduced performance on specific CPUs.
+
+The kernel or a userspace policy daemon can use these capabilities to modify
+task placement decisions. For instance, if either the performance or energy
+capabilities of a given logical processor becomes zero, it is an indication that
+the hardware recommends to the operating system to not schedule any tasks on
+that processor for performance or energy efficiency reasons, respectively.
+
+Implementation details for Linux
+--------------------------------
+
+The infrastructure to handle thermal event interrupts has two parts. In the
+Local Vector Table of a CPU's local APIC, there exists a register for the
+Thermal Monitor Register. This register controls how interrupts are delivered
+to a CPU when the thermal monitor generates an interrupt. Further details
+can be found in the Intel SDM Vol. 3 Section 10.5 [1]_.
+
+The thermal monitor may generate interrupts per CPU or per package. The HFI
+generates package-level interrupts. This monitor is configured and initialized
+via a set of machine-specific registers. Specifically, the HFI interrupt and
+status are controlled via designated bits in the IA32_PACKAGE_THERM_INTERRUPT
+and IA32_PACKAGE_THERM_STATUS registers, respectively. There exists one HFI
+table per package. Further details can be found in the Intel SDM Vol. 3
+Section 14.9 [1]_.
+
+The hardware issues an HFI interrupt after it has updated the HFI table and the
+table is ready for the operating system to consume. CPUs receive such interrupt via the
+thermal entry in the Local APIC's Local Vector Table.
+
+When servicing such interrupt, the HFI driver parses the updated table and
+relays the update to userspace using the thermal notification framework. Given
+that there may be many HFI updates every second, the updates relayed to
+userspace are throttled at a rate of CONFIG_HZ jiffies.
+
+References
+----------
+
+.. [1] https://www.intel.com/sdm
index 6db4e29..b39f510 100644 (file)
 #define X86_FEATURE_HWP_ACT_WINDOW     (14*32+ 9) /* HWP Activity Window */
 #define X86_FEATURE_HWP_EPP            (14*32+10) /* HWP Energy Perf. Preference */
 #define X86_FEATURE_HWP_PKG_REQ                (14*32+11) /* HWP Package Level Request */
+#define X86_FEATURE_HFI                        (14*32+19) /* Hardware Feedback Interface */
 
 /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
 #define X86_FEATURE_NPT                        (15*32+ 0) /* Nested Page Table support */
index a4a39c3..0e7f303 100644 (file)
 
 #define PACKAGE_THERM_STATUS_PROCHOT           (1 << 0)
 #define PACKAGE_THERM_STATUS_POWER_LIMIT       (1 << 10)
+#define PACKAGE_THERM_STATUS_HFI_UPDATED       (1 << 26)
 
 #define MSR_IA32_PACKAGE_THERM_INTERRUPT       0x000001b2
 
 #define PACKAGE_THERM_INT_HIGH_ENABLE          (1 << 0)
 #define PACKAGE_THERM_INT_LOW_ENABLE           (1 << 1)
 #define PACKAGE_THERM_INT_PLN_ENABLE           (1 << 24)
+#define PACKAGE_THERM_INT_HFI_ENABLE           (1 << 25)
 
 /* Thermal Thresholds Support */
 #define THERM_INT_THRESHOLD0_ENABLE    (1 << 15)
 #define MSR_VM_IGNNE                    0xc0010115
 #define MSR_VM_HSAVE_PA                 0xc0010117
 
+/* Hardware Feedback Interface */
+#define MSR_IA32_HW_FEEDBACK_PTR        0x17d0
+#define MSR_IA32_HW_FEEDBACK_CONFIG     0x17d1
+
 #endif /* _ASM_X86_MSR_INDEX_H */
index c83ea5d..f0c8456 100644 (file)
@@ -99,3 +99,17 @@ config INTEL_MENLOW
          Intel Menlow platform.
 
          If unsure, say N.
+
+config INTEL_HFI_THERMAL
+       bool "Intel Hardware Feedback Interface"
+       depends on NET
+       depends on CPU_SUP_INTEL
+       depends on X86_THERMAL_VECTOR
+       select THERMAL_NETLINK
+       help
+         Select this option to enable the Hardware Feedback Interface. If
+         selected, hardware provides guidance to the operating system on
+         the performance and energy efficiency capabilities of each CPU.
+         These capabilities may change as a result of changes in the operating
+         conditions of the system such as power and thermal limits. If selected,
+         the kernel relays updates in CPUs' capabilities to userspace.
index 960b562..9a8d805 100644 (file)
@@ -13,3 +13,4 @@ obj-$(CONFIG_INTEL_PCH_THERMAL)       += intel_pch_thermal.o
 obj-$(CONFIG_INTEL_TCC_COOLING)        += intel_tcc_cooling.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
 obj-$(CONFIG_INTEL_MENLOW)     += intel_menlow.o
+obj-$(CONFIG_INTEL_HFI_THERMAL) += intel_hfi.o
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
new file mode 100644 (file)
index 0000000..730fd12
--- /dev/null
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Hardware Feedback Interface Driver
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ *
+ * Authors: Aubrey Li <aubrey.li@linux.intel.com>
+ *          Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ *
+ *
+ * The Hardware Feedback Interface provides performance and energy efficiency
+ * capability information for each CPU in the system. Depending on the processor
+ * model, hardware may periodically update these capabilities as a result of
+ * changes in the operating conditions (e.g., power limits or thermal
+ * constraints). On other processor models, there is a single HFI update
+ * at boot.
+ *
+ * This file provides functionality to process HFI updates and relay these
+ * updates to userspace.
+ */
+
+#define pr_fmt(fmt)  "intel-hfi: " fmt
+
+#include <linux/bitops.h>
+#include <linux/cpufeature.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/math.h>
+#include <linux/mutex.h>
+#include <linux/percpu-defs.h>
+#include <linux/printk.h>
+#include <linux/processor.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/topology.h>
+#include <linux/workqueue.h>
+
+#include <asm/msr.h>
+
+#include "../thermal_core.h"
+#include "intel_hfi.h"
+
+#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \
+                                    BIT(9) | BIT(11) | BIT(26))
+
+/* Hardware Feedback Interface MSR configuration bits */
+#define HW_FEEDBACK_PTR_VALID_BIT              BIT(0)
+#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT      BIT(0)
+
+/* CPUID detection and enumeration definitions for HFI */
+
+#define CPUID_HFI_LEAF 6
+
+/*
+ * Capability bits enumerated by hardware. This is the first member of
+ * union cpuid6_edx, i.e. it is read from the EDX output of CPUID leaf
+ * CPUID_HFI_LEAF. @bits aliases the bitfields so that the number of supported
+ * capabilities can be counted in one operation (see hfi_parse_features()).
+ */
+union hfi_capabilities {
+       struct {
+               u8      performance:1;
+               u8      energy_efficiency:1;
+               u8      __reserved:6;
+       } split;
+       u8 bits;
+};
+
+/*
+ * Layout of the EDX register returned by CPUID leaf CPUID_HFI_LEAF.
+ * hfi_parse_features() derives the table size, in 4KB pages, as
+ * @table_pages + 1. init_hfi_cpu_index() stores @index as the row of the
+ * executing CPU in its HFI table. @full gives access to the raw register value.
+ */
+union cpuid6_edx {
+       struct {
+               union hfi_capabilities  capabilities;
+               u32                     table_pages:4;
+               u32                     __reserved:4;
+               s32                     index:16;
+       } split;
+       u32 full;
+};
+
+/**
+ * struct hfi_cpu_data - HFI capabilities per CPU
+ * @perf_cap:          Performance capability
+ * @ee_cap:            Energy efficiency capability
+ *
+ * Capabilities of a logical processor in the HFI table. These capabilities are
+ * unitless.
+ */
+struct hfi_cpu_data {
+       u8      perf_cap;
+       u8      ee_cap;
+} __packed;
+
+/**
+ * struct hfi_hdr - Header of the HFI table
+ * @perf_updated:      Hardware updated performance capabilities
+ * @ee_updated:                Hardware updated energy efficiency capabilities
+ *
+ * Properties of the data in an HFI table.
+ */
+struct hfi_hdr {
+       u8      perf_updated;
+       u8      ee_updated;
+} __packed;
+
+/**
+ * struct hfi_instance - Representation of an HFI instance (i.e., a table)
+ * @local_table:       Base of the local copy of the HFI table
+ * @timestamp:         Timestamp of the last update of the local table.
+ *                     Located at the base of the local table.
+ * @hdr:               Base address of the header of the local table
+ * @data:              Base address of the data of the local table
+ * @cpus:              CPUs represented in this HFI table instance
+ * @hw_table:          Pointer to the HFI table of this instance
+ * @update_work:       Delayed work to process HFI updates
+ * @table_lock:                Lock to protect accesses to the table of this instance
+ * @event_lock:                Lock to process HFI interrupts
+ *
+ * A set of parameters to parse and navigate a specific HFI table.
+ */
+struct hfi_instance {
+       /* Both names refer to the same address: the table starts with its timestamp. */
+       union {
+               void                    *local_table;
+               u64                     *timestamp;
+       };
+       void                    *hdr;
+       void                    *data;
+       cpumask_var_t           cpus;
+       void                    *hw_table;
+       struct delayed_work     update_work;
+       raw_spinlock_t          table_lock;
+       raw_spinlock_t          event_lock;
+};
+
+/**
+ * struct hfi_features - Supported HFI features
+ * @nr_table_pages:    Size of the HFI table in 4KB pages
+ * @cpu_stride:                Stride size to locate the capability data of a logical
+ *                     processor within the table (i.e., row stride)
+ * @hdr_size:          Size of the table header
+ *
+ * Parameters and supported features that are common to all HFI instances
+ */
+struct hfi_features {
+       unsigned int    nr_table_pages;
+       unsigned int    cpu_stride;
+       unsigned int    hdr_size;
+};
+
+/**
+ * struct hfi_cpu_info - Per-CPU attributes to consume HFI data
+ * @index:             Row of this CPU in its HFI table
+ * @hfi_instance:      Attributes of the HFI table to which this CPU belongs
+ *
+ * Parameters to link a logical processor to an HFI table and a row within it.
+ */
+struct hfi_cpu_info {
+       s16                     index;
+       struct hfi_instance     *hfi_instance;
+};
+
+static DEFINE_PER_CPU(struct hfi_cpu_info, hfi_cpu_info) = { .index = -1 };
+
+static int max_hfi_instances;
+static struct hfi_instance *hfi_instances;
+
+static struct hfi_features hfi_features;
+static DEFINE_MUTEX(hfi_instance_lock);
+
+static struct workqueue_struct *hfi_updates_wq;
+#define HFI_UPDATE_INTERVAL            HZ
+#define HFI_MAX_THERM_NOTIFY_COUNT     16
+
+/*
+ * Fill @cpu_caps with one entry per CPU set in @hfi_instance->cpus, in cpumask
+ * iteration order. Each entry is read from that CPU's row in the local copy of
+ * the HFI table. The caller must provide room for every CPU in the mask.
+ *
+ * The table lock is held, with interrupts disabled, while the rows are read.
+ */
+static void get_hfi_caps(struct hfi_instance *hfi_instance,
+                        struct thermal_genl_cpu_caps *cpu_caps)
+{
+       struct thermal_genl_cpu_caps *out = cpu_caps;
+       int cpu;
+
+       raw_spin_lock_irq(&hfi_instance->table_lock);
+       for_each_cpu(cpu, hfi_instance->cpus) {
+               s16 row = per_cpu(hfi_cpu_info, cpu).index;
+               struct hfi_cpu_data *entry;
+
+               /* Rows are hfi_features.cpu_stride bytes apart. */
+               entry = hfi_instance->data + row * hfi_features.cpu_stride;
+
+               out->cpu = cpu;
+
+               /*
+                * The table holds 8-bit values. Shift them left by two to
+                * scale them to the [0, 1023] interval that thermal netlink
+                * uses.
+                */
+               out->performance = entry->perf_cap << 2;
+               out->efficiency = entry->ee_cap << 2;
+
+               out++;
+       }
+       raw_spin_unlock_irq(&hfi_instance->table_lock);
+}
+
+/*
+ * Relay the capabilities of all the CPUs of @hfi_instance to user space. Call
+ * update_capabilities() when there are changes in the HFI table.
+ *
+ * The capabilities are sent in events that carry at most
+ * HFI_MAX_THERM_NOTIFY_COUNT CPUs each. Nothing is sent if the instance has no
+ * CPUs or if the temporary buffer cannot be allocated.
+ */
+static void update_capabilities(struct hfi_instance *hfi_instance)
+{
+       struct thermal_genl_cpu_caps *cpu_caps;
+       int sent, total;
+
+       /* CPUs may come online/offline while processing an HFI update. */
+       mutex_lock(&hfi_instance_lock);
+
+       total = cpumask_weight(hfi_instance->cpus);
+
+       /* No CPUs to report in this hfi_instance. */
+       if (!total)
+               goto out;
+
+       cpu_caps = kcalloc(total, sizeof(*cpu_caps), GFP_KERNEL);
+       if (!cpu_caps)
+               goto out;
+
+       get_hfi_caps(hfi_instance, cpu_caps);
+
+       /* Full chunks first; the last event carries whatever remains. */
+       for (sent = 0; sent < total; ) {
+               int chunk = total - sent;
+
+               if (chunk > HFI_MAX_THERM_NOTIFY_COUNT)
+                       chunk = HFI_MAX_THERM_NOTIFY_COUNT;
+
+               thermal_genl_cpu_capability_event(chunk, &cpu_caps[sent]);
+               sent += chunk;
+       }
+
+       kfree(cpu_caps);
+out:
+       mutex_unlock(&hfi_instance_lock);
+}
+
+/*
+ * Delayed work handler: relay the capabilities of the HFI instance that embeds
+ * @work to user space.
+ *
+ * container_of() only subtracts the offset of update_work from the address of
+ * the work item, so the result cannot be NULL and needs no check.
+ */
+static void hfi_update_work_fn(struct work_struct *work)
+{
+       struct hfi_instance *hfi_instance;
+
+       hfi_instance = container_of(to_delayed_work(work), struct hfi_instance,
+                                   update_work);
+
+       update_capabilities(hfi_instance);
+}
+
+/*
+ * intel_hfi_process_event() - Handle an HFI update signalled by a package
+ * thermal interrupt
+ * @pkg_therm_status_msr_val:  Non-zero if the package thermal status reports
+ *                             an HFI update; the function returns at once
+ *                             when it is zero
+ *
+ * Copy the table that hardware updated into the local copy of the HFI
+ * instance of the current CPU, acknowledge the update to hardware and queue
+ * delayed work to relay the new capabilities to user space.
+ */
+void intel_hfi_process_event(__u64 pkg_therm_status_msr_val)
+{
+       struct hfi_instance *hfi_instance;
+       int cpu = smp_processor_id();
+       struct hfi_cpu_info *info;
+       u64 new_timestamp;
+
+       if (!pkg_therm_status_msr_val)
+               return;
+
+       /* The address of a per-CPU variable is never NULL; no check needed. */
+       info = &per_cpu(hfi_cpu_info, cpu);
+
+       /*
+        * A CPU is linked to its HFI instance before the thermal vector in the
+        * local APIC is unmasked. Hence, info->hfi_instance cannot be NULL
+        * when receiving an HFI event.
+        */
+       hfi_instance = info->hfi_instance;
+       if (unlikely(!hfi_instance)) {
+               pr_debug("Received event on CPU %d but instance was null", cpu);
+               return;
+       }
+
+       /*
+        * On most systems, all CPUs in the package receive a package-level
+        * thermal interrupt when there is an HFI update. It is sufficient to
+        * let a single CPU acknowledge the update and queue work to
+        * process it. The remaining CPUs can resume their work.
+        */
+       if (!raw_spin_trylock(&hfi_instance->event_lock))
+               return;
+
+       /* Skip duplicated updates. */
+       new_timestamp = *(u64 *)hfi_instance->hw_table;
+       if (*hfi_instance->timestamp == new_timestamp) {
+               raw_spin_unlock(&hfi_instance->event_lock);
+               return;
+       }
+
+       raw_spin_lock(&hfi_instance->table_lock);
+
+       /*
+        * Copy the updated table into our local copy. This includes the new
+        * timestamp.
+        */
+       memcpy(hfi_instance->local_table, hfi_instance->hw_table,
+              hfi_features.nr_table_pages << PAGE_SHIFT);
+
+       raw_spin_unlock(&hfi_instance->table_lock);
+       raw_spin_unlock(&hfi_instance->event_lock);
+
+       /*
+        * Let hardware know that we are done reading the HFI table and it is
+        * free to update it again.
+        *
+        * Write the clear mask with only the HFI bit dropped. The value passed
+        * by the caller has at most the HFI bit set, so AND-ing it with a mask
+        * that excludes that bit would always write zero, which would also
+        * clear every other log bit in the status register.
+        */
+       wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS,
+              THERM_STATUS_CLEAR_PKG_MASK & ~PACKAGE_THERM_STATUS_HFI_UPDATED);
+
+       queue_delayed_work(hfi_updates_wq, &hfi_instance->update_work,
+                          HFI_UPDATE_INTERVAL);
+}
+
+/*
+ * Record in @info the row of the executing CPU in its HFI table, as enumerated
+ * in the EDX output of CPUID leaf CPUID_HFI_LEAF. An index that has already
+ * been read (i.e., is not negative) is left untouched.
+ */
+static void init_hfi_cpu_index(struct hfi_cpu_info *info)
+{
+       union cpuid6_edx regs;
+
+       if (info->index < 0) {
+               regs.full = cpuid_edx(CPUID_HFI_LEAF);
+               info->index = regs.split.index;
+       }
+}
+
+/*
+ * The format of the HFI table depends on the number of capabilities that the
+ * hardware supports. Set up the pointers used to navigate the local copy of
+ * the table: the time-stamp comes first, the header follows it, and the
+ * per-CPU data follows the header.
+ */
+static void init_hfi_instance(struct hfi_instance *hfi_instance)
+{
+       void *hdr = hfi_instance->local_table +
+                   sizeof(*hfi_instance->timestamp);
+
+       hfi_instance->hdr = hdr;
+       hfi_instance->data = hdr + hfi_features.hdr_size;
+}
+
+/**
+ * intel_hfi_online() - Enable HFI on @cpu
+ * @cpu:       CPU in which the HFI will be enabled
+ *
+ * Enable the HFI to be used in @cpu. The HFI is enabled at the die/package
+ * level. The first CPU in the die/package to come online does the full HFI
+ * initialization. Subsequent CPUs will just link themselves to the HFI
+ * instance of their die/package.
+ *
+ * This function is called before enabling the thermal vector in the local APIC
+ * in order to ensure that @cpu has an associated HFI instance when it receives
+ * an HFI event.
+ *
+ * If memory for the tables cannot be allocated, the instance is left
+ * uninitialized (its header pointer stays NULL) and the next CPU of the
+ * die/package to come online retries the initialization.
+ */
+void intel_hfi_online(unsigned int cpu)
+{
+       struct hfi_instance *hfi_instance;
+       struct hfi_cpu_info *info;
+       phys_addr_t hw_table_pa;
+       size_t table_size;
+       u64 msr_val;
+       u16 die_id;
+
+       /* Nothing to do if hfi_instances are missing. */
+       if (!hfi_instances)
+               return;
+
+       /*
+        * Link @cpu to the HFI instance of its package/die. It does not
+        * matter whether the instance has been initialized.
+        */
+       info = &per_cpu(hfi_cpu_info, cpu);
+       die_id = topology_logical_die_id(cpu);
+       hfi_instance = info->hfi_instance;
+       if (!hfi_instance) {
+               /* die_id is unsigned: an upper-bound check is all it needs. */
+               if (die_id >= max_hfi_instances)
+                       return;
+
+               hfi_instance = &hfi_instances[die_id];
+               info->hfi_instance = hfi_instance;
+       }
+
+       init_hfi_cpu_index(info);
+
+       /*
+        * Now check if the HFI instance of the package/die of @cpu has been
+        * initialized (by checking its header). In such case, all we have to
+        * do is to add @cpu to this instance's cpumask.
+        */
+       mutex_lock(&hfi_instance_lock);
+       if (hfi_instance->hdr) {
+               cpumask_set_cpu(cpu, hfi_instance->cpus);
+               goto unlock;
+       }
+
+       /*
+        * alloc_pages_exact() and free_pages_exact() take a size in bytes, not
+        * a number of pages. Both copies of the table must be able to hold the
+        * nr_table_pages pages that intel_hfi_process_event() copies.
+        */
+       table_size = (size_t)hfi_features.nr_table_pages << PAGE_SHIFT;
+
+       /*
+        * Hardware is programmed with the physical address of the first page
+        * frame of the table. Hence, the allocated memory must be page-aligned.
+        */
+       hfi_instance->hw_table = alloc_pages_exact(table_size,
+                                                  GFP_KERNEL | __GFP_ZERO);
+       if (!hfi_instance->hw_table)
+               goto unlock;
+
+       hw_table_pa = virt_to_phys(hfi_instance->hw_table);
+
+       /*
+        * Allocate memory to keep a local copy of the table that
+        * hardware generates.
+        */
+       hfi_instance->local_table = kzalloc(table_size, GFP_KERNEL);
+       if (!hfi_instance->local_table) {
+               /*
+                * Hardware has not been programmed with this address yet, so
+                * the pages can still be released. Do not leave a dangling
+                * pointer behind.
+                */
+               free_pages_exact(hfi_instance->hw_table, table_size);
+               hfi_instance->hw_table = NULL;
+               goto unlock;
+       }
+
+       /*
+        * Program the address of the feedback table of this die/package. On
+        * some processors, hardware remembers the old address of the HFI table
+        * even after having been reprogrammed and re-enabled. Thus, do not free
+        * the pages allocated for the table or reprogram the hardware with a
+        * new base address. Namely, program the hardware only once.
+        */
+       msr_val = hw_table_pa | HW_FEEDBACK_PTR_VALID_BIT;
+       wrmsrl(MSR_IA32_HW_FEEDBACK_PTR, msr_val);
+
+       init_hfi_instance(hfi_instance);
+
+       INIT_DELAYED_WORK(&hfi_instance->update_work, hfi_update_work_fn);
+       raw_spin_lock_init(&hfi_instance->table_lock);
+       raw_spin_lock_init(&hfi_instance->event_lock);
+
+       cpumask_set_cpu(cpu, hfi_instance->cpus);
+
+       /*
+        * Enable the hardware feedback interface and never disable it. See
+        * comment on programming the address of the table.
+        */
+       rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+       msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT;
+       wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+
+unlock:
+       mutex_unlock(&hfi_instance_lock);
+}
+
+/**
+ * intel_hfi_offline() - Disable HFI on @cpu
+ * @cpu:       CPU in which the HFI will be disabled
+ *
+ * Remove @cpu from the set of CPUs covered by its HFI instance.
+ *
+ * On some processors, hardware remembers previous programming settings even
+ * after being reprogrammed. Thus, keep HFI enabled even if all CPUs in the
+ * die/package of @cpu are offline. See note in intel_hfi_online().
+ */
+void intel_hfi_offline(unsigned int cpu)
+{
+       struct hfi_instance *hfi_instance = per_cpu(hfi_cpu_info, cpu).hfi_instance;
+
+       /*
+        * Only act if @cpu has an associated HFI instance and that instance
+        * has been initialized (i.e., it has a non-NULL header). Also, HFI
+        * instances are only initialized if X86_FEATURE_HFI is present.
+        */
+       if (!hfi_instance || !hfi_instance->hdr)
+               return;
+
+       mutex_lock(&hfi_instance_lock);
+       cpumask_clear_cpu(cpu, hfi_instance->cpus);
+       mutex_unlock(&hfi_instance_lock);
+}
+
+/*
+ * Enumerate the HFI features of the boot CPU and fill in hfi_features.
+ *
+ * Return: 0 on success, -ENODEV if the HFI is not present or if hardware does
+ * not report the performance capability.
+ */
+static __init int hfi_parse_features(void)
+{
+       unsigned int nr_caps, rounded_size;
+       union cpuid6_edx edx;
+
+       if (!boot_cpu_has(X86_FEATURE_HFI))
+               return -ENODEV;
+
+       /*
+        * The feature bit guarantees that CPUID_HFI_LEAF exists. Its EDX
+        * output holds the supported capabilities and the size of the table.
+        */
+       edx.full = cpuid_edx(CPUID_HFI_LEAF);
+
+       if (!edx.split.capabilities.split.performance) {
+               pr_debug("Performance reporting not supported! Not using HFI\n");
+               return -ENODEV;
+       }
+
+       /*
+        * Each supported capability is one column in the HFI table. Mask out
+        * the reserved bits before counting.
+        */
+       edx.split.capabilities.split.__reserved = 0;
+       nr_caps = hweight8(edx.split.capabilities.bits);
+
+       /*
+        * Both the table header (one change indication per capability) and the
+        * data of each logical processor are rounded up to a multiple of
+        * 8 bytes.
+        */
+       rounded_size = DIV_ROUND_UP(nr_caps, 8) * 8;
+
+       /* The number of 4KB pages required by the table */
+       hfi_features.nr_table_pages = edx.split.table_pages + 1;
+       hfi_features.hdr_size = rounded_size;
+       hfi_features.cpu_stride = rounded_size;
+
+       return 0;
+}
+
+/*
+ * Allocate the array of HFI instances (one per die/package), a cpumask for
+ * each of them, and the workqueue used to relay HFI updates. On any failure,
+ * everything allocated here is released and hfi_instances is left NULL.
+ */
+void __init intel_hfi_init(void)
+{
+       int nr_masks, k;
+
+       if (hfi_parse_features())
+               return;
+
+       /* There is one HFI instance per die/package. */
+       max_hfi_instances = topology_max_packages() *
+                           topology_max_die_per_package();
+
+       /*
+        * This allocation may fail. CPU hotplug callbacks must check
+        * for a null pointer.
+        */
+       hfi_instances = kcalloc(max_hfi_instances, sizeof(*hfi_instances),
+                               GFP_KERNEL);
+       if (!hfi_instances)
+               return;
+
+       /* nr_masks counts the cpumasks that were successfully allocated. */
+       for (nr_masks = 0; nr_masks < max_hfi_instances; nr_masks++) {
+               if (!zalloc_cpumask_var(&hfi_instances[nr_masks].cpus,
+                                       GFP_KERNEL))
+                       goto err_nomem;
+       }
+
+       hfi_updates_wq = create_singlethread_workqueue("hfi-updates");
+       if (hfi_updates_wq)
+               return;
+
+err_nomem:
+       for (k = 0; k < nr_masks; k++)
+               free_cpumask_var(hfi_instances[k].cpus);
+
+       kfree(hfi_instances);
+       hfi_instances = NULL;
+}
diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h
new file mode 100644 (file)
index 0000000..325aa78
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_HFI_H
+#define _INTEL_HFI_H
+
+#if defined(CONFIG_INTEL_HFI_THERMAL)
+void __init intel_hfi_init(void);
+void intel_hfi_online(unsigned int cpu);
+void intel_hfi_offline(unsigned int cpu);
+void intel_hfi_process_event(__u64 pkg_therm_status_msr_val);
+#else
+static inline void intel_hfi_init(void) { }
+static inline void intel_hfi_online(unsigned int cpu) { }
+static inline void intel_hfi_offline(unsigned int cpu) { }
+static inline void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) { }
+#endif /* CONFIG_INTEL_HFI_THERMAL */
+
+#endif /* _INTEL_HFI_H */
index dab7e8f..8352083 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/irq.h>
 #include <asm/msr.h>
 
+#include "intel_hfi.h"
 #include "thermal_interrupt.h"
 
 /* How long to wait between reporting thermal events */
@@ -475,6 +476,13 @@ static int thermal_throttle_online(unsigned int cpu)
        INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
        INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
 
+       /*
+        * The first CPU coming online will enable the HFI. Usually this causes
+        * hardware to issue an HFI thermal interrupt. Such interrupt will reach
+        * the CPU once we enable the thermal vector in the local APIC.
+        */
+       intel_hfi_online(cpu);
+
        /* Unmask the thermal vector after the above workqueues are initialized. */
        l = apic_read(APIC_LVTTHMR);
        apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
@@ -492,6 +500,8 @@ static int thermal_throttle_offline(unsigned int cpu)
        l = apic_read(APIC_LVTTHMR);
        apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
 
+       intel_hfi_offline(cpu);
+
        cancel_delayed_work_sync(&state->package_throttle.therm_work);
        cancel_delayed_work_sync(&state->core_throttle.therm_work);
 
@@ -509,6 +519,8 @@ static __init int thermal_throttle_init_device(void)
        if (!atomic_read(&therm_throt_en))
                return 0;
 
+       intel_hfi_init();
+
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
                                thermal_throttle_online,
                                thermal_throttle_offline);
@@ -608,6 +620,10 @@ void intel_thermal_interrupt(void)
                                        PACKAGE_THERM_STATUS_POWER_LIMIT,
                                        POWER_LIMIT_EVENT,
                                        PACKAGE_LEVEL);
+
+               if (this_cpu_has(X86_FEATURE_HFI))
+                       intel_hfi_process_event(msr_val &
+                                               PACKAGE_THERM_STATUS_HFI_UPDATED);
        }
 }
 
@@ -717,6 +733,12 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
                        wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
                              l | (PACKAGE_THERM_INT_LOW_ENABLE
                                | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+
+               if (cpu_has(c, X86_FEATURE_HFI)) {
+                       rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+                       wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+                             l | PACKAGE_THERM_INT_HFI_ENABLE, h);
+               }
        }
 
        rdmsr(MSR_IA32_MISC_ENABLE, l, h);
index 73e68cc..32fea51 100644 (file)
@@ -43,6 +43,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] =
        [THERMAL_GENL_ATTR_CDEV_MAX_STATE]      = { .type = NLA_U32 },
        [THERMAL_GENL_ATTR_CDEV_NAME]           = { .type = NLA_STRING,
                                                    .len = THERMAL_NAME_LENGTH },
+       /* CPU capabilities */
+       [THERMAL_GENL_ATTR_CPU_CAPABILITY]              = { .type = NLA_NESTED },
+       [THERMAL_GENL_ATTR_CPU_CAPABILITY_ID]           = { .type = NLA_U32 },
+       [THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE]  = { .type = NLA_U32 },
+       [THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY]   = { .type = NLA_U32 },
 };
 
 struct param {
@@ -58,6 +63,8 @@ struct param {
        int temp;
        int cdev_state;
        int cdev_max_state;
+       struct thermal_genl_cpu_caps *cpu_capabilities;
+       int cpu_capabilities_count;
 };
 
 typedef int (*cb_t)(struct param *);
@@ -190,6 +197,42 @@ static int thermal_genl_event_gov_change(struct param *p)
        return 0;
 }
 
+/*
+ * Encode a CPU capability change event.
+ *
+ * Opens a nested THERMAL_GENL_ATTR_CPU_CAPABILITY attribute in p->msg and
+ * appends, for each of the p->cpu_capabilities_count entries of
+ * p->cpu_capabilities, three u32 attributes in this order: CPU id,
+ * performance and efficiency.
+ *
+ * Returns 0 on success. If the nest cannot be started or any attribute
+ * cannot be added, the nest is cancelled (when it was started) and
+ * -EMSGSIZE is returned.
+ */
+static int thermal_genl_event_cpu_capability_change(struct param *p)
+{
+       struct thermal_genl_cpu_caps *caps = p->cpu_capabilities;
+       struct sk_buff *skb = p->msg;
+       struct nlattr *nest;
+       int idx;
+
+       nest = nla_nest_start(skb, THERMAL_GENL_ATTR_CPU_CAPABILITY);
+       if (!nest)
+               return -EMSGSIZE;
+
+       for (idx = 0; idx < p->cpu_capabilities_count; idx++) {
+               struct thermal_genl_cpu_caps *entry = &caps[idx];
+
+               /* Short-circuit keeps the id/performance/efficiency order. */
+               if (nla_put_u32(skb, THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
+                               entry->cpu) ||
+                   nla_put_u32(skb, THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
+                               entry->performance) ||
+                   nla_put_u32(skb, THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
+                               entry->efficiency)) {
+                       nla_nest_cancel(skb, nest);
+                       return -EMSGSIZE;
+               }
+       }
+
+       nla_nest_end(skb, nest);
+
+       return 0;
+}
+
 int thermal_genl_event_tz_delete(struct param *p)
        __attribute__((alias("thermal_genl_event_tz")));
 
@@ -219,6 +262,7 @@ static cb_t event_cb[] = {
        [THERMAL_GENL_EVENT_CDEV_DELETE]        = thermal_genl_event_cdev_delete,
        [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE]  = thermal_genl_event_cdev_state_update,
        [THERMAL_GENL_EVENT_TZ_GOV_CHANGE]      = thermal_genl_event_gov_change,
+       [THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change,
 };
 
 /*
@@ -356,6 +400,15 @@ int thermal_notify_tz_gov_change(int tz_id, const char *name)
        return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p);
 }
 
+/*
+ * Send a THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE event carrying @count
+ * entries from the @caps array. Returns whatever
+ * thermal_genl_send_event() returns.
+ */
+int thermal_genl_cpu_capability_event(int count,
+                                     struct thermal_genl_cpu_caps *caps)
+{
+       struct param p = {
+               .cpu_capabilities       = caps,
+               .cpu_capabilities_count = count,
+       };
+
+       return thermal_genl_send_event(THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, &p);
+}
+EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event);
+
 /*************************** Command encoding ********************************/
 
 static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz,
index e554f76..1052f52 100644 (file)
@@ -4,6 +4,12 @@
  *  Author: Daniel Lezcano <daniel.lezcano@linaro.org>
  */
 
+/*
+ * One CPU capability record, as passed to
+ * thermal_genl_cpu_capability_event(). Each field is emitted to user space
+ * as a u32 netlink attribute.
+ */
+struct thermal_genl_cpu_caps {
+       int cpu;                /* sent as THERMAL_GENL_ATTR_CPU_CAPABILITY_ID */
+       int performance;        /* sent as THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE */
+       int efficiency;         /* sent as THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY */
+};
+
 /* Netlink notification function */
 #ifdef CONFIG_THERMAL_NETLINK
 int __init thermal_netlink_init(void);
@@ -23,6 +29,8 @@ int thermal_notify_cdev_add(int cdev_id, const char *name, int max_state);
 int thermal_notify_cdev_delete(int cdev_id);
 int thermal_notify_tz_gov_change(int tz_id, const char *name);
 int thermal_genl_sampling_temp(int id, int temp);
+int thermal_genl_cpu_capability_event(int count,
+                                     struct thermal_genl_cpu_caps *caps);
 #else
 static inline int thermal_netlink_init(void)
 {
@@ -101,4 +109,10 @@ static inline int thermal_genl_sampling_temp(int id, int temp)
 {
        return 0;
 }
+
+/* Stub for builds without CONFIG_THERMAL_NETLINK: nothing is sent, returns 0. */
+static inline int thermal_genl_cpu_capability_event(int count, struct thermal_genl_cpu_caps *caps)
+{
+       return 0;
+}
+
 #endif /* CONFIG_THERMAL_NETLINK */
index 9aa2fed..fc78bf3 100644 (file)
@@ -44,7 +44,10 @@ enum thermal_genl_attr {
        THERMAL_GENL_ATTR_CDEV_MAX_STATE,
        THERMAL_GENL_ATTR_CDEV_NAME,
        THERMAL_GENL_ATTR_GOV_NAME,
-
+       THERMAL_GENL_ATTR_CPU_CAPABILITY,
+       THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
+       THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
+       THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
        __THERMAL_GENL_ATTR_MAX,
 };
 #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
@@ -71,6 +74,7 @@ enum thermal_genl_event {
        THERMAL_GENL_EVENT_CDEV_DELETE,         /* Cdev unbound */
        THERMAL_GENL_EVENT_CDEV_STATE_UPDATE,   /* Cdev state updated */
        THERMAL_GENL_EVENT_TZ_GOV_CHANGE,       /* Governor policy changed  */
+       THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE,       /* CPU capability changed */
        __THERMAL_GENL_EVENT_MAX,
 };
 #define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1)
index b61456d..81e36bd 100644 (file)
@@ -1 +1 @@
-intel-speed-select-y +=  isst-config.o isst-core.o isst-display.o
+intel-speed-select-y +=  isst-config.o isst-core.o isst-display.o isst-daemon.o hfi-events.o
index 12c6939..d2fba12 100644 (file)
@@ -13,8 +13,8 @@ endif
 # Do not use make's built-in rules
 # (this improves performance and avoids hard-to-debug behaviour);
 MAKEFLAGS += -r
-
-override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include
+override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -I/usr/include/libnl3
+override LDFLAGS += -lnl-genl-3 -lnl-3
 
 ALL_TARGETS := intel-speed-select
 ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS))
@@ -31,7 +31,11 @@ $(OUTPUT)include/linux/isst_if.h: ../../../../include/uapi/linux/isst_if.h
        mkdir -p $(OUTPUT)include/linux 2>&1 || true
        ln -sf $(CURDIR)/../../../../include/uapi/linux/isst_if.h $@
 
-prepare: $(OUTPUT)include/linux/isst_if.h
+$(OUTPUT)include/linux/thermal.h: ../../../../include/uapi/linux/thermal.h
+       mkdir -p $(OUTPUT)include/linux 2>&1 || true
+       ln -sf $(CURDIR)/../../../../include/uapi/linux/thermal.h $@
+
+prepare: $(OUTPUT)include/linux/isst_if.h $(OUTPUT)include/linux/thermal.h
 
 ISST_IN := $(OUTPUT)intel-speed-select-in.o
 
diff --git a/tools/power/x86/intel-speed-select/hfi-events.c b/tools/power/x86/intel-speed-select/hfi-events.c
new file mode 100644 (file)
index 0000000..e856767
--- /dev/null
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Speed Select -- Read HFI events for OOB
+ * Copyright (c) 2022 Intel Corporation.
+ */
+
+/*
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+
+ * WPA Supplicant - driver interaction with Linux nl80211/cfg80211
+ * Copyright (c) 2003-2008, Jouni Malinen <j@w1.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Alternatively, this software may be distributed under the terms of
+ * BSD license.
+ *
+ * Requires
+ * libnl-genl-3-dev
+ *
+ * For Fedora/CentOS
+ * dnf install libnl3-devel
+ * For Ubuntu
+ * apt install libnl-3-dev libnl-genl-3-dev
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <getopt.h>
+#include <signal.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/family.h>
+#include <netlink/genl/ctrl.h>
+
+#include <linux/thermal.h>
+#include "isst.h"
+
+/* Netlink state shared by the helpers in this file. */
+struct hfi_event_data {
+       struct nl_sock *nl_handle;      /* socket; set by hfi_main() */
+       struct nl_cb *nl_cb;            /* callback set; cloned by send_and_recv_msgs() */
+};
+
+/* Single global instance, filled in by hfi_main(). */
+struct hfi_event_data drv;
+
+/*
+ * NL_CB_ACK callback: store 0 in the int that @arg points to and return
+ * NL_STOP.
+ */
+static int ack_handler(struct nl_msg *msg, void *arg)
+{
+       *(int *)arg = 0;
+
+       return NL_STOP;
+}
+
+/*
+ * NL_CB_FINISH callback: store 0 in the int that @arg points to and return
+ * NL_SKIP.
+ */
+static int finish_handler(struct nl_msg *msg, void *arg)
+{
+       *(int *)arg = 0;
+
+       return NL_SKIP;
+}
+
+/*
+ * Error callback: copy the error code from the netlink error message into
+ * the int that @arg points to and return NL_SKIP.
+ */
+static int error_handler(struct sockaddr_nl *nla, struct nlmsgerr *err,
+                        void *arg)
+{
+       *(int *)arg = err->error;
+
+       return NL_SKIP;
+}
+
+/*
+ * NL_CB_SEQ_CHECK callback installed by hfi_main(): unconditionally
+ * returns NL_OK, so no sequence number is ever rejected here.
+ */
+static int seq_check_handler(struct nl_msg *msg, void *arg)
+{
+       return NL_OK;
+}
+
+/*
+ * Send @msg on drv->nl_handle and receive replies until one of the
+ * ack/finish/error callbacks changes the status.
+ *
+ * @valid_handler, when non-NULL, is installed as the NL_CB_VALID callback
+ * with @valid_data as its argument.
+ *
+ * @msg is always freed here, on success and on failure, so the caller
+ * must not use it afterwards.
+ *
+ * Returns -ENOMEM if the callback set cannot be cloned, the negative
+ * result of nl_send_auto_complete() if sending fails, and otherwise the
+ * value left in the status word by the callbacks (0 from ack/finish, the
+ * reported error code from the error callback).
+ */
+static int send_and_recv_msgs(struct hfi_event_data *drv,
+                             struct nl_msg *msg,
+                             int (*valid_handler)(struct nl_msg *, void *),
+                             void *valid_data)
+{
+       int status = -ENOMEM;
+       struct nl_cb *cb_copy = nl_cb_clone(drv->nl_cb);
+
+       if (cb_copy) {
+               status = nl_send_auto_complete(drv->nl_handle, msg);
+               if (status >= 0) {
+                       /* Positive means "still waiting for a reply". */
+                       status = 1;
+
+                       nl_cb_err(cb_copy, NL_CB_CUSTOM, error_handler, &status);
+                       nl_cb_set(cb_copy, NL_CB_FINISH, NL_CB_CUSTOM,
+                                 finish_handler, &status);
+                       nl_cb_set(cb_copy, NL_CB_ACK, NL_CB_CUSTOM,
+                                 ack_handler, &status);
+
+                       if (valid_handler)
+                               nl_cb_set(cb_copy, NL_CB_VALID, NL_CB_CUSTOM,
+                                         valid_handler, valid_data);
+
+                       /*
+                        * NOTE(review): the return value of nl_recvmsgs() is
+                        * not examined; the loop only ends when a callback
+                        * updates the status.
+                        */
+                       while (status > 0)
+                               nl_recvmsgs(drv->nl_handle, cb_copy);
+               }
+       }
+
+       nl_cb_put(cb_copy);
+       nlmsg_free(msg);
+
+       return status;
+}
+
+/* Lookup request/result passed to family_handler() through its @arg. */
+struct family_data {
+       const char *group;      /* multicast group name to look for */
+       int id;                 /* set to the group id once a match is found */
+};
+
+/*
+ * Valid-message callback used while resolving a multicast group id.
+ *
+ * @arg points to a struct family_data. The message's
+ * CTRL_ATTR_MCAST_GROUPS list is walked and, for the first entry that has
+ * both a name and an id and whose name matches res->group, the id is
+ * stored in res->id.
+ *
+ * Returns NL_SKIP when the message has no multicast group list, 0
+ * otherwise.
+ */
+static int family_handler(struct nl_msg *msg, void *arg)
+{
+       struct genlmsghdr *hdr = nlmsg_data(nlmsg_hdr(msg));
+       struct nlattr *ctrl_attrs[CTRL_ATTR_MAX + 1];
+       struct family_data *res = arg;
+       struct nlattr *grp;
+       int remaining;
+
+       nla_parse(ctrl_attrs, CTRL_ATTR_MAX, genlmsg_attrdata(hdr, 0),
+                 genlmsg_attrlen(hdr, 0), NULL);
+       if (!ctrl_attrs[CTRL_ATTR_MCAST_GROUPS])
+               return NL_SKIP;
+
+       nla_for_each_nested(grp, ctrl_attrs[CTRL_ATTR_MCAST_GROUPS], remaining) {
+               struct nlattr *grp_attrs[CTRL_ATTR_MCAST_GRP_MAX + 1];
+               struct nlattr *name_attr, *id_attr;
+
+               nla_parse(grp_attrs, CTRL_ATTR_MCAST_GRP_MAX, nla_data(grp),
+                         nla_len(grp), NULL);
+
+               name_attr = grp_attrs[CTRL_ATTR_MCAST_GRP_NAME];
+               id_attr = grp_attrs[CTRL_ATTR_MCAST_GRP_ID];
+               if (!name_attr || !id_attr)
+                       continue;
+
+               /* Compare at most as many bytes as the name attribute holds. */
+               if (strncmp(nla_data(name_attr), res->group,
+                           nla_len(name_attr)) != 0)
+                       continue;
+
+               res->id = nla_get_u32(id_attr);
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Resolve the id of multicast group @group within generic netlink family
+ * @family by sending a CTRL_CMD_GETFAMILY request to "nlctrl" and letting
+ * family_handler() scan the reply.
+ *
+ * Returns the group id on success, -ENOMEM if the request message cannot
+ * be allocated, -ENOENT (the initial value of res.id) if the exchange
+ * succeeds but family_handler() never stores an id, or another non-zero
+ * value if the exchange itself fails.
+ */
+static int nl_get_multicast_id(struct hfi_event_data *drv,
+                              const char *family, const char *group)
+{
+       struct nl_msg *msg;
+       int ret = -1;
+       struct family_data res = { group, -ENOENT };
+
+       msg = nlmsg_alloc();
+       if (!msg)
+               return -ENOMEM;
+       genlmsg_put(msg, 0, 0, genl_ctrl_resolve(drv->nl_handle, "nlctrl"),
+                   0, 0, CTRL_CMD_GETFAMILY, 0);
+       /* libnl's NLA_PUT_STRING macro jumps to nla_put_failure on failure. */
+       NLA_PUT_STRING(msg, CTRL_ATTR_FAMILY_NAME, family);
+
+       ret = send_and_recv_msgs(drv, msg, family_handler, &res);
+       /* send_and_recv_msgs() freed msg; clear it so it is not freed twice. */
+       msg = NULL;
+       if (ret == 0)
+               ret = res.id;
+
+nla_put_failure:
+       nlmsg_free(msg);
+       return ret;
+}
+
+/* One CPU capability triple decoded by handle_event(). */
+struct perf_cap {
+       int cpu;        /* first value of each triple */
+       int perf;       /* second value of each triple */
+       int eff;        /* third value of each triple */
+};
+
+/*
+ * Act on one decoded capability record. Only the CPU number is used: it
+ * is handed to process_level_change(); perf and eff are ignored here.
+ */
+static void process_hfi_event(struct perf_cap *perf_cap)
+{
+       process_level_change(perf_cap->cpu);
+}
+
+/*
+ * NL_CB_VALID callback for thermal generic netlink events.
+ *
+ * Only THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE is acted upon. The nested
+ * THERMAL_GENL_ATTR_CPU_CAPABILITY attribute is read as a flat sequence of
+ * u32 values grouped in threes (cpu, performance, efficiency); each
+ * complete triple is passed to process_hfi_event().
+ *
+ * Returns NL_SKIP for a message that fails to parse and 0 otherwise.
+ */
+static int handle_event(struct nl_msg *n, void *arg)
+{
+       struct nlmsghdr *nlh = nlmsg_hdr(n);
+       struct genlmsghdr *genlhdr = genlmsg_hdr(nlh);
+       struct nlattr *attrs[THERMAL_GENL_ATTR_MAX + 1];
+       struct perf_cap perf_cap = { 0 };
+       struct nlattr *cap;
+       int j, index = 0;
+       int ret;
+
+       ret = genlmsg_parse(nlh, 0, attrs, THERMAL_GENL_ATTR_MAX, NULL);
+
+       debug_printf("Received event %d parse_rer:%d\n", genlhdr->cmd, ret);
+
+       /* attrs[] cannot be trusted when parsing failed. */
+       if (ret < 0)
+               return NL_SKIP;
+
+       if (genlhdr->cmd != THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE)
+               return 0;
+
+       debug_printf("THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE\n");
+
+       /* Iterating a missing nest would dereference a NULL attribute. */
+       if (!attrs[THERMAL_GENL_ATTR_CPU_CAPABILITY])
+               return 0;
+
+       nla_for_each_nested(cap, attrs[THERMAL_GENL_ATTR_CPU_CAPABILITY], j) {
+               switch (index) {
+               case 0:
+                       perf_cap.cpu = nla_get_u32(cap);
+                       break;
+               case 1:
+                       perf_cap.perf = nla_get_u32(cap);
+                       break;
+               case 2:
+                       perf_cap.eff = nla_get_u32(cap);
+                       break;
+               default:
+                       break;
+               }
+               ++index;
+               /* A full (cpu, perf, eff) triple has been collected. */
+               if (index == 3) {
+                       index = 0;
+                       process_hfi_event(&perf_cap);
+               }
+       }
+
+       return 0;
+}
+
+/* Set to 1 by hfi_exit(); makes the receive loop in hfi_main() stop. */
+static int _hfi_exit;
+
+/*
+ * Query CPUID leaf 6 and report whether bit 19 of EAX is set. hfi_main()
+ * treats a set bit as "HFI supported".
+ *
+ * Returns 1 when the bit is set, 0 otherwise.
+ */
+static int check_hf_suport(void)
+{
+       unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+
+       __cpuid(6, eax, ebx, ecx, edx);
+
+       return (eax & BIT(19)) ? 1 : 0;
+}
+
+/*
+ * Subscribe to the thermal generic netlink event group and process
+ * messages until hfi_exit() is called or nl_recvmsgs() reports an error.
+ *
+ * The netlink socket and callback set created here are released on every
+ * path before returning, and the matching fields of the global drv are
+ * reset so they never point at freed objects.
+ *
+ * Returns -1 if HFI is not supported or any setup step fails, and 0 once
+ * the receive loop has ended.
+ */
+int hfi_main(void)
+{
+       struct nl_sock *sock;
+       struct nl_cb *cb;
+       int err = 0;
+       int mcast_id;
+       int no_block = 0;
+       int ret = -1;
+
+       if (!check_hf_suport()) {
+               fprintf(stderr, "CPU Doesn't support HFI\n");
+               return -1;
+       }
+
+       sock = nl_socket_alloc();
+       if (!sock) {
+               fprintf(stderr, "nl_socket_alloc failed\n");
+               return -1;
+       }
+
+       if (genl_connect(sock)) {
+               fprintf(stderr, "genl_connect(sk_event) failed\n");
+               goto free_sock;
+       }
+
+       drv.nl_handle = sock;
+       drv.nl_cb = cb = nl_cb_alloc(NL_CB_DEFAULT);
+       if (drv.nl_cb == NULL) {
+               printf("Failed to allocate netlink callbacks");
+               goto free_sock;
+       }
+
+       mcast_id = nl_get_multicast_id(&drv, THERMAL_GENL_FAMILY_NAME,
+                                  THERMAL_GENL_EVENT_GROUP_NAME);
+       if (mcast_id < 0) {
+               fprintf(stderr, "nl_get_multicast_id failed\n");
+               goto free_cb;
+       }
+
+       if (nl_socket_add_membership(sock, mcast_id)) {
+               fprintf(stderr, "nl_socket_add_membership failed");
+               goto free_cb;
+       }
+
+       nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, seq_check_handler, 0);
+       nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, handle_event, NULL);
+
+       if (no_block)
+               nl_socket_set_nonblocking(sock);
+
+       debug_printf("hfi is initialized\n");
+
+       while (!_hfi_exit && !err) {
+               err = nl_recvmsgs(sock, cb);
+               debug_printf("nl_recv_message err:%d\n", err);
+       }
+
+       /* Leaving the loop is not treated as an initialization failure. */
+       ret = 0;
+
+free_cb:
+       /* Drop the reference taken by nl_cb_alloc(). */
+       nl_cb_put(cb);
+       drv.nl_cb = NULL;
+free_sock:
+       nl_socket_free(sock);
+       drv.nl_handle = NULL;
+
+       return ret;
+}
+
+/* Ask the receive loop in hfi_main() to stop by setting its exit flag. */
+void hfi_exit(void)
+{
+       _hfi_exit = 1;
+}
index efe72fa..060390e 100644 (file)
@@ -15,7 +15,8 @@ struct process_cmd_struct {
        int arg;
 };
 
-static const char *version_str = "v1.11";
+static const char *version_str = "v1.12";
+
 static const int supported_api_ver = 1;
 static struct isst_if_platform_info isst_platform_info;
 static char *progname;
@@ -368,7 +369,7 @@ int get_topo_max_cpus(void)
        return topo_max_cpus;
 }
 
-static void set_cpu_online_offline(int cpu, int state)
+void set_cpu_online_offline(int cpu, int state)
 {
        char buffer[128];
        int fd, ret;
@@ -409,12 +410,10 @@ static void force_all_cpus_online(void)
        unlink("/var/run/isst_cpu_topology.dat");
 }
 
-#define MAX_PACKAGE_COUNT 8
-#define MAX_DIE_PER_PACKAGE 2
-static void for_each_online_package_in_set(void (*callback)(int, void *, void *,
-                                                           void *, void *),
-                                          void *arg1, void *arg2, void *arg3,
-                                          void *arg4)
+void for_each_online_package_in_set(void (*callback)(int, void *, void *,
+                                                    void *, void *),
+                                   void *arg1, void *arg2, void *arg3,
+                                   void *arg4)
 {
        int max_packages[MAX_PACKAGE_COUNT * MAX_PACKAGE_COUNT];
        int pkg_index = 0, i;
@@ -2803,7 +2802,9 @@ static void usage(void)
        printf("\t[-p|--pause] : Delay between two mail box commands in milliseconds\n");
        printf("\t[-r|--retry] : Retry count for mail box commands on failure, default 3\n");
        printf("\t[-v|--version] : Print version\n");
-
+       printf("\t[-b|--oob : Start a daemon to process HFI events for perf profile change from Out of Band agent.\n");
+       printf("\t[-n|--no-daemon : Don't run as daemon. By default --oob will turn on daemon mode\n");
+       printf("\t[-w|--delay : Delay for reading config level state change in OOB poll mode.\n");
        printf("\nResult format\n");
        printf("\tResult display uses a common format for each command:\n");
        printf("\tResults are formatted in text/JSON with\n");
@@ -2837,6 +2838,9 @@ static void cmdline(int argc, char **argv)
        int opt, force_cpus_online = 0;
        int option_index = 0;
        int ret;
+       int oob_mode = 0;
+       int poll_interval = -1;
+       int no_daemon = 0;
 
        static struct option long_options[] = {
                { "all-cpus-online", no_argument, 0, 'a' },
@@ -2849,6 +2853,9 @@ static void cmdline(int argc, char **argv)
                { "out", required_argument, 0, 'o' },
                { "retry", required_argument, 0, 'r' },
                { "version", no_argument, 0, 'v' },
+               { "oob", no_argument, 0, 'b' },
+               { "no-daemon", no_argument, 0, 'n' },
+               { "poll-interval", required_argument, 0, 'w' },
                { 0, 0, 0, 0 }
        };
 
@@ -2875,7 +2882,7 @@ static void cmdline(int argc, char **argv)
        }
 
        progname = argv[0];
-       while ((opt = getopt_long_only(argc, argv, "+c:df:hio:va", long_options,
+       while ((opt = getopt_long_only(argc, argv, "+c:df:hio:vabw:n", long_options,
                                       &option_index)) != -1) {
                switch (opt) {
                case 'a':
@@ -2920,12 +2927,26 @@ static void cmdline(int argc, char **argv)
                case 'v':
                        print_version();
                        break;
+               case 'b':
+                       oob_mode = 1;
+                       break;
+               case 'n':
+                       no_daemon = 1;
+                       break;
+               case 'w':
+                       ret = strtol(optarg, &ptr, 10);
+                       if (!ret) {
+                               fprintf(stderr, "Invalid poll interval count\n");
+                               exit(0);
+                       }
+                       poll_interval = ret;
+                       break;
                default:
                        usage();
                }
        }
 
-       if (optind > (argc - 2)) {
+       if (optind > (argc - 2) && !oob_mode) {
                usage();
                exit(0);
        }
@@ -2936,6 +2957,17 @@ static void cmdline(int argc, char **argv)
        set_cpu_present_cpu_mask();
        set_cpu_target_cpu_mask();
 
+       if (oob_mode) {
+               create_cpu_map();
+               if (debug_flag)
+                       fprintf(stderr, "OOB mode is enabled in debug mode\n");
+
+               ret = isst_daemon(debug_flag, poll_interval, no_daemon);
+               if (ret)
+                       fprintf(stderr, "OOB mode enable failed\n");
+               goto out;
+       }
+
        if (!is_clx_n_platform()) {
                ret = isst_fill_platform_info();
                if (ret)
diff --git a/tools/power/x86/intel-speed-select/isst-daemon.c b/tools/power/x86/intel-speed-select/isst-daemon.c
new file mode 100644 (file)
index 0000000..dd37292
--- /dev/null
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Speed Select -- Allow speed select to daemonize
+ * Copyright (c) 2022 Intel Corporation.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <getopt.h>
+#include <signal.h>
+#include <time.h>
+
+#include "isst.h"
+
+/*
+ * Last config level seen per package/die by process_level_change();
+ * init_levels() sets every entry to -1.
+ */
+static int per_package_levels_info[MAX_PACKAGE_COUNT][MAX_DIE_PER_PACKAGE];
+/* time() of the last process_level_change() run per package/die. */
+static time_t per_package_levels_tm[MAX_PACKAGE_COUNT][MAX_DIE_PER_PACKAGE];
+
+/* Reset every per-package/die cached config level to -1. */
+static void init_levels(void)
+{
+       int pkg, die;
+
+       for (pkg = 0; pkg < MAX_PACKAGE_COUNT; pkg++) {
+               for (die = 0; die < MAX_DIE_PER_PACKAGE; die++) {
+                       per_package_levels_info[pkg][die] = -1;
+               }
+       }
+}
+
+/*
+ * Check whether the config TDP level of the package/die that @cpu belongs
+ * to has changed and, if so, online/offline the CPUs of that package/die
+ * to match the core mask of the new level.
+ *
+ * Nothing is done when:
+ *  - the package or die id is outside the per-package tables,
+ *  - the same package/die was handled less than 2 seconds ago,
+ *  - the level cannot be read, the config TDP is locked, or the level is
+ *    the same as the cached one,
+ *  - the core mask for the new level cannot be read.
+ */
+void process_level_change(int cpu)
+{
+       struct isst_pkg_ctdp_level_info ctdp_level;
+       int pkg_id = get_physical_package_id(cpu);
+       int die_id = get_physical_die_id(cpu);
+       struct isst_pkg_ctdp pkg_dev;
+       time_t tm;
+       int ret;
+
+       /*
+        * Both ids index arrays dimensioned MAX_PACKAGE_COUNT by
+        * MAX_DIE_PER_PACKAGE, so the valid range is [0, MAX) for each;
+        * reject anything outside it, including negative values.
+        */
+       if (pkg_id < 0 || pkg_id >= MAX_PACKAGE_COUNT ||
+           die_id < 0 || die_id >= MAX_DIE_PER_PACKAGE) {
+               debug_printf("Invalid package/die info for cpu:%d\n", cpu);
+               return;
+       }
+
+       /* Rate limit: at most one pass per package/die every 2 seconds. */
+       tm = time(NULL);
+       if (tm - per_package_levels_tm[pkg_id][die_id] < 2 )
+               return;
+
+       per_package_levels_tm[pkg_id][die_id] = tm;
+
+       ret = isst_get_ctdp_levels(cpu, &pkg_dev);
+       if (ret) {
+               debug_printf("Can't get tdp levels for cpu:%d\n", cpu);
+               return;
+       }
+
+       debug_printf("Get Config level %d pkg:%d die:%d current_level:%d \n", cpu,
+                     pkg_id, die_id, pkg_dev.current_level);
+
+       if (pkg_dev.locked) {
+               debug_printf("config TDP s locked \n");
+               return;
+       }
+
+       if (per_package_levels_info[pkg_id][die_id] == pkg_dev.current_level)
+               return;
+
+       debug_printf("**Config level change for cpu:%d pkg:%d die:%d from %d to %d\n",
+                     cpu, pkg_id, die_id, per_package_levels_info[pkg_id][die_id],
+                     pkg_dev.current_level);
+
+       per_package_levels_info[pkg_id][die_id] = pkg_dev.current_level;
+
+       ctdp_level.core_cpumask_size =
+               alloc_cpu_set(&ctdp_level.core_cpumask);
+       ret = isst_get_coremask_info(cpu, pkg_dev.current_level, &ctdp_level);
+       if (ret) {
+               free_cpu_set(ctdp_level.core_cpumask);
+               debug_printf("Can't get core_mask:%d\n", cpu);
+               return;
+       }
+
+       if (ctdp_level.cpu_count) {
+               int i, max_cpus = get_topo_max_cpus();
+               for (i = 0; i < max_cpus; ++i) {
+                       /* Only touch CPUs of the package/die being handled. */
+                       if (pkg_id != get_physical_package_id(i) || die_id != get_physical_die_id(i))
+                               continue;
+                       if (CPU_ISSET_S(i, ctdp_level.core_cpumask_size, ctdp_level.core_cpumask)) {
+                               fprintf(stderr, "online cpu %d\n", i);
+                               set_cpu_online_offline(i, 1);
+                       } else {
+                               fprintf(stderr, "offline cpu %d\n", i);
+                               set_cpu_online_offline(i, 0);
+                       }
+               }
+       }
+
+       free_cpu_set(ctdp_level.core_cpumask);
+}
+
+/*
+ * Callback for for_each_online_package_in_set(): runs
+ * process_level_change() for @cpu. The four extra arguments are unused.
+ */
+static void _poll_for_config_change(int cpu, void *arg1, void *arg2,
+                                   void *arg3, void *arg4)
+{
+       process_level_change(cpu);
+}
+
+/*
+ * One polling pass: hand _poll_for_config_change() to
+ * for_each_online_package_in_set() with no extra arguments.
+ */
+static void poll_for_config_change(void)
+{
+       for_each_online_package_in_set(_poll_for_config_change, NULL, NULL,
+                                      NULL, NULL);
+}
+
+/* Set by signal_handler(); ends the polling loop in isst_daemon(). */
+static int done = 0;
+/* Descriptor of the pid/lock file opened by daemonize(). */
+static int pid_file_handle;
+
+/*
+ * Signal handler. For SIGINT and SIGTERM it sets the done flag, calls
+ * hfi_exit() and then terminates the process with exit(0). Any other
+ * signal is ignored.
+ *
+ * NOTE(review): exit() is not on the POSIX list of async-signal-safe
+ * functions; confirm that calling it from a handler is acceptable here.
+ */
+static void signal_handler(int sig)
+{
+       if (sig != SIGINT && sig != SIGTERM)
+               return;
+
+       done = 1;
+       hfi_exit();
+       exit(0);
+}
+
+/*
+ * Turn the calling process into a background daemon.
+ *
+ * @rundir:  directory to chdir() into.
+ * @pidfile: path of the pid/lock file to create, lock and write the pid to.
+ *
+ * Returns without doing anything when the parent pid is already 1. In the
+ * original process the function never returns after a successful fork()
+ * (the parent exits); in the child it returns once setup is complete.
+ * Any failure during setup terminates the process.
+ */
+static void daemonize(char *rundir, char *pidfile)
+{
+       int pid, sid, i;
+       char str[10];
+       struct sigaction sig_actions;
+       sigset_t sig_set;
+       int ret;
+
+       if (getppid() == 1)
+               return;
+
+       /* Block child-status and terminal job-control signals. */
+       sigemptyset(&sig_set);
+       sigaddset(&sig_set, SIGCHLD);
+       sigaddset(&sig_set, SIGTSTP);
+       sigaddset(&sig_set, SIGTTOU);
+       sigaddset(&sig_set, SIGTTIN);
+       sigprocmask(SIG_BLOCK, &sig_set, NULL);
+
+       sig_actions.sa_handler = signal_handler;
+       sigemptyset(&sig_actions.sa_mask);
+       sig_actions.sa_flags = 0;
+
+       /* signal_handler() acts on SIGTERM/SIGINT only; SIGHUP is a no-op. */
+       sigaction(SIGHUP, &sig_actions, NULL);
+       sigaction(SIGTERM, &sig_actions, NULL);
+       sigaction(SIGINT, &sig_actions, NULL);
+
+       pid = fork();
+       if (pid < 0) {
+               /* Could not fork */
+               exit(EXIT_FAILURE);
+       }
+       /* Parent: done, the child carries on as the daemon. */
+       if (pid > 0)
+               exit(EXIT_SUCCESS);
+
+       umask(027);
+
+       sid = setsid();
+       if (sid < 0)
+               exit(EXIT_FAILURE);
+
+       /* close all descriptors */
+       for (i = getdtablesize(); i >= 0; --i)
+               close(i);
+
+       /*
+        * Reopen /dev/null and duplicate it twice; with every descriptor
+        * closed above these presumably become fds 0, 1 and 2.
+        * NOTE(review): the result of open() is not checked.
+        */
+       i = open("/dev/null", O_RDWR);
+       ret = dup(i);
+       if (ret == -1)
+               exit(EXIT_FAILURE);
+
+       ret = dup(i);
+       if (ret == -1)
+               exit(EXIT_FAILURE);
+
+       ret = chdir(rundir);
+       if (ret == -1)
+               exit(EXIT_FAILURE);
+
+       pid_file_handle = open(pidfile, O_RDWR | O_CREAT, 0600);
+       if (pid_file_handle == -1) {
+               /* Couldn't open lock file */
+               exit(1);
+       }
+       /* Try to lock file */
+#ifdef LOCKF_SUPPORT
+       if (lockf(pid_file_handle, F_TLOCK, 0) == -1) {
+#else
+       if (flock(pid_file_handle, LOCK_EX|LOCK_NB) < 0) {
+#endif
+               /* Couldn't get lock on lock file */
+               fprintf(stderr, "Couldn't get lock file %d\n", getpid());
+               exit(1);
+       }
+       /* Record our pid in the (now locked) pid file. */
+       snprintf(str, sizeof(str), "%d\n", getpid());
+       ret = write(pid_file_handle, str, strlen(str));
+       if (ret == -1)
+               exit(EXIT_FAILURE);
+
+       /* NOTE(review): this closes the /dev/null descriptor opened above. */
+       close(i);
+}
+
+/*
+ * Entry point for OOB mode.
+ *
+ * @debug_mode:    non-zero keeps the process in the foreground.
+ * @poll_interval: seconds between polls; a negative value selects HFI
+ *                 event mode instead of polling.
+ * @no_daemon:     non-zero keeps the process in the foreground.
+ *
+ * The process daemonizes only in HFI mode with neither @no_daemon nor
+ * @debug_mode set; otherwise just a SIGINT handler is installed.
+ *
+ * Returns the result of hfi_main() in HFI mode, or 0 when the polling
+ * loop ends.
+ */
+int isst_daemon(int debug_mode, int poll_interval, int no_daemon)
+{
+       int ret;
+
+       if (no_daemon || poll_interval >= 0 || debug_mode) {
+               signal(SIGINT, signal_handler);
+       } else {
+               fprintf(stderr, "OOB mode is enabled and will run as daemon\n");
+               daemonize((char *) "/tmp/",
+                               (char *)"/tmp/hfi-events.pid");
+       }
+
+       init_levels();
+
+       if (poll_interval >= 0) {
+               debug_printf("Starting loop\n");
+               while (!done) {
+                       sleep(poll_interval);
+                       poll_for_config_change();
+               }
+
+               return 0;
+       }
+
+       ret = hfi_main();
+       if (ret)
+               fprintf(stderr, "HFI initialization failed\n");
+
+       /*
+        * NOTE(review): printed whenever hfi_main() returns, including when
+        * it returns 0; confirm whether it was meant only for the failure
+        * case.
+        */
+       fprintf(stderr, "Must specify poll-interval\n");
+
+       return ret;
+}
index 1aa15d5..0796d8c 100644 (file)
@@ -76,6 +76,9 @@
 
 #define DISP_FREQ_MULTIPLIER 100
 
+#define MAX_PACKAGE_COUNT 8
+#define MAX_DIE_PER_PACKAGE 2
+
 struct isst_clos_config {
        int pkg_id;
        int die_id;
@@ -260,4 +263,14 @@ extern int is_skx_based_platform(void);
 extern int is_spr_platform(void);
 extern int is_icx_platform(void);
 extern void isst_trl_display_information(int cpu, FILE *outf, unsigned long long trl);
+
+extern void set_cpu_online_offline(int cpu, int state);
+extern void for_each_online_package_in_set(void (*callback)(int, void *, void *,
+                                                           void *, void *),
+                                          void *arg1, void *arg2, void *arg3,
+                                          void *arg4);
+extern int isst_daemon(int debug_mode, int poll_interval, int no_daemon);
+extern void process_level_change(int cpu);
+extern int hfi_main(void);
+extern void hfi_exit(void);
 #endif