#include <net/netfilter/nf_conntrack.h>
#endif
#include <net/net_namespace.h> /* Netw namespace */
+#include <linux/sched/isolation.h>
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
struct u64_stats_sync syncp;
};
+/* Default nice for estimator kthreads */
+#define IPVS_EST_NICE 0
+
/* IPVS statistics objects */
struct ip_vs_estimator {
struct hlist_node list;
int sysctl_schedule_icmp;
int sysctl_ignore_tunneled;
int sysctl_run_estimation;
+#ifdef CONFIG_SYSCTL
+ cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */
+ int est_cpulist_valid; /* cpulist set */
+ int sysctl_est_nice; /* kthread nice */
+ int est_stopped; /* stop tasks */
+#endif
/* ip_vs_lblc */
int sysctl_lblc_expiration;
return ipvs->sysctl_run_estimation;
}
+static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+{
+ if (ipvs->est_cpulist_valid)
+ return ipvs->sysctl_est_cpulist;
+ else
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+}
+
+static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+{
+ return ipvs->sysctl_est_nice;
+}
+
#else
static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
return 1;
}
+static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+{
+ return housekeeping_cpumask(HK_TYPE_KTHREAD);
+}
+
+static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
+{
+ return IPVS_EST_NICE;
+}
+
#endif
/* IPVS core functions
struct ip_vs_est_kt_data *kd);
void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd);
+static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs)
+{
+#ifdef CONFIG_SYSCTL
+ ipvs->est_stopped = ipvs->est_cpulist_valid &&
+ cpumask_empty(sysctl_est_cpulist(ipvs));
+#endif
+}
+
+static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs)
+{
+#ifdef CONFIG_SYSCTL
+ return ipvs->est_stopped;
+#else
+ return false;
+#endif
+}
+
+static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs)
+{
+ unsigned int limit = IPVS_EST_CPU_KTHREADS *
+ cpumask_weight(sysctl_est_cpulist(ipvs));
+
+ return max(1U, limit);
+}
+
/* Various IPVS packet transmitters (from ip_vs_xmit.c) */
int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
/* New config ? Stop kthread tasks */
if (genid != genid_done)
ip_vs_est_kthread_stop(kd);
- if (!kd->task) {
+ if (!kd->task && !ip_vs_est_stopped(ipvs)) {
/* Do not start kthreads above 0 in calc phase */
if ((!id || !ipvs->est_calc_phase) &&
ip_vs_est_kthread_start(ipvs, kd) < 0)
return rc;
}
+static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ cpumask_var_t newmask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buffer, newmask);
+ if (ret)
+ goto out;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (!ipvs->est_cpulist_valid) {
+ if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ ipvs->est_cpulist_valid = 1;
+ }
+ cpumask_and(newmask, newmask, ¤t->cpus_mask);
+ cpumask_copy(*valp, newmask);
+ /* est_max_threads may depend on cpulist size */
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+ ipvs->est_calc_phase = 1;
+ ip_vs_est_reload_start(ipvs);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+
+out:
+ free_cpumask_var(newmask);
+ return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
+ size_t size)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ struct cpumask *mask;
+ int ret;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+ mutex_unlock(&ipvs->est_mutex);
+
+ return ret;
+}
+
+static int ipvs_proc_est_cpulist(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ /* Ignore both read and write(append) if *ppos not 0 */
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ /* proc_sys_call_handler() appends terminator */
+ ret = ipvs_proc_est_cpumask_set(table, buffer);
+ if (ret >= 0)
+ *ppos += *lenp;
+ } else {
+ /* proc_sys_call_handler() allocates 1 byte for terminator */
+ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+ if (ret >= 0) {
+ *lenp = ret;
+ *ppos += *lenp;
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_est_nice(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ if (val < MIN_NICE || val > MAX_NICE) {
+ ret = -EINVAL;
+ } else {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ }
+ return ret;
+}
+
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "est_cpulist",
+ .maxlen = NR_CPUS, /* unused */
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_cpulist,
+ },
+ {
+ .procname = "est_nice",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_nice,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
expire_nodest_conn_handler);
+ ipvs->est_stopped = 0;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
ipvs->sysctl_run_estimation = 1;
tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+ ipvs->est_cpulist_valid = 0;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+ ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_nice;
+
#ifdef CONFIG_IP_VS_DEBUG
/* Global sysctls must be ro in non-init netns */
if (!net_eq(net, &init_net))
unregister_net_sysctl_table(ipvs->sysctl_hdr);
ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ipvs->est_cpulist_valid)
+ free_cpumask_var(ipvs->sysctl_est_cpulist);
+
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
}
- kthread contexts are created and attached to array
- the kthread tasks are started when first service is added, before that
the total stats are not estimated
+ - when configuration (cpulist/nice) is changed, the tasks are restarted
+ by work (est_reload_work)
+ - kthread tasks are stopped while the cpulist is empty
- the kthread context holds lists with estimators (chains) which are
processed every 2 seconds
- as estimators can be added dynamically and in bursts, we try to spread
/* Ignore reloads before first service is added */
if (!ipvs->enable)
return;
+ ip_vs_est_stopped_recalc(ipvs);
/* Bump the kthread configuration genid */
atomic_inc(&ipvs->est_genid);
queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
goto out;
}
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+ set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+
pr_info("starting estimator thread %d...\n", kd->id);
wake_up_process(kd->task);
}
/* Start kthread tasks only when services are present */
- if (ipvs->enable) {
+ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
ret = ip_vs_est_kthread_start(ipvs, kd);
if (ret < 0)
goto out;
int ret;
if (!ipvs->est_max_threads && ipvs->enable)
- ipvs->est_max_threads = IPVS_EST_CPU_KTHREADS *
- num_possible_cpus();
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
est->ktid = -1;
est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */