drivers/infiniband/hw/hfi1/affinity.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright(c) 2015 - 2020 Intel Corporation.
4  */
5
6 #include <linux/topology.h>
7 #include <linux/cpumask.h>
8 #include <linux/interrupt.h>
9 #include <linux/numa.h>
10
11 #include "hfi.h"
12 #include "affinity.h"
13 #include "sdma.h"
14 #include "trace.h"
15
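/* Global NUMA node affinity state; the embedded mutex protects the list */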
16 struct hfi1_affinity_node_list node_affinity = {
17         .list = LIST_HEAD_INIT(node_affinity.list),
18         .lock = __MUTEX_INITIALIZER(node_affinity.lock)
19 };
20
21 /* Name of IRQ types, indexed by enum irq_type */
22 static const char * const irq_type_names[] = {
23         "SDMA",
24         "RCVCTXT",
25         "NETDEVCTXT",
26         "GENERAL",
27         "OTHER",
28 };
29
30 /* Per NUMA node count of HFI devices */
31 static unsigned int *hfi1_per_node_cntr;
32
33 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
34 {
35         cpumask_clear(&set->mask);
36         cpumask_clear(&set->used);
37         set->gen = 0;
38 }
39
40 /* Increment generation of CPU set if needed */
41 static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
42 {
43         if (cpumask_equal(&set->mask, &set->used)) {
44                 /*
45                  * We've used up all the CPUs, bump up the generation
46                  * and reset the 'used' map
47                  */
48                 set->gen++;
49                 cpumask_clear(&set->used);
50         }
51 }
52
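/*
 * Decrement generation of CPU set if needed: once the 'used' mask has been
 * fully drained, drop back to the previous generation and mark the whole
 * 'mask' as used again.
 */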
53 static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
54 {
55         if (cpumask_empty(&set->used) && set->gen) {
56                 set->gen--;
57                 cpumask_copy(&set->used, &set->mask);
58         }
59 }
60
61 /* Get the first CPU from the list of unused CPUs in a CPU set data structure */
62 static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
63 {
64         int cpu;
65
66         if (!diff || !set)
67                 return -EINVAL;
68
69         _cpu_mask_set_gen_inc(set);
70
71         /* Find out CPUs left in CPU mask */
72         cpumask_andnot(diff, &set->mask, &set->used);
73
74         cpu = cpumask_first(diff);
75         if (cpu >= nr_cpu_ids) /* empty */
76                 cpu = -EINVAL;
77         else
78                 cpumask_set_cpu(cpu, &set->used);
79
80         return cpu;
81 }
82
83 static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
84 {
85         if (!set)
86                 return;
87
88         cpumask_clear_cpu(cpu, &set->used);
89         _cpu_mask_set_gen_dec(set);
90 }
91
92 /* Initialize non-HT cpu cores mask */
93 void init_real_cpu_mask(void)
94 {
95         int possible, curr_cpu, i, ht;
96
97         cpumask_clear(&node_affinity.real_cpu_mask);
98
99         /* Start with cpu online mask as the real cpu mask */
100         cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
101
102         /*
103          * Remove HT cores from the real cpu mask.  Do this in two steps below.
104          */
105         possible = cpumask_weight(&node_affinity.real_cpu_mask);
106         ht = cpumask_weight(topology_sibling_cpumask(
107                                 cpumask_first(&node_affinity.real_cpu_mask)));
108         /*
109          * Step 1.  Skip over the first N HT siblings and use them as the
110          * "real" cores.  Assumes that HT cores are not enumerated in
111          * succession (except in the single core case).
112          */
113         curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
114         for (i = 0; i < possible / ht; i++)
115                 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
116         /*
117          * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
118          * skip any gaps.
119          */
120         for (; i < possible; i++) {
121                 cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
122                 curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
123         }
124 }
125
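/*
 * Initialize the global affinity structure and count the number of HFI
 * devices on each NUMA node from the PCI device table.
 */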
126 int node_affinity_init(void)
127 {
128         int node;
129         struct pci_dev *dev = NULL;
130         const struct pci_device_id *ids = hfi1_pci_tbl;
131
132         cpumask_clear(&node_affinity.proc.used);
133         cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
134
135         node_affinity.proc.gen = 0;
136         node_affinity.num_core_siblings =
137                                 cpumask_weight(topology_sibling_cpumask(
138                                         cpumask_first(&node_affinity.proc.mask)
139                                         ));
140         node_affinity.num_possible_nodes = num_possible_nodes();
141         node_affinity.num_online_nodes = num_online_nodes();
142         node_affinity.num_online_cpus = num_online_cpus();
143
144         /*
145          * The real cpu mask is part of the affinity struct but it has to be
146          * initialized early. It is needed to calculate the number of user
147          * contexts in set_up_context_variables().
148          */
149         init_real_cpu_mask();
150
151         hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
152                                      sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
153         if (!hfi1_per_node_cntr)
154                 return -ENOMEM;
155
156         while (ids->vendor) {
157                 dev = NULL;
158                 while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
159                         node = pcibus_to_node(dev->bus);
160                         if (node < 0)
161                                 goto out;
162
163                         hfi1_per_node_cntr[node]++;
164                 }
165                 ids++;
166         }
167
168         return 0;
169
170 out:
171         /*
172          * Invalid PCI NUMA node information found, note it, and populate
173          * our database 1:1.
174          */
175         pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
176         pr_err("HFI: System BIOS may need to be upgraded\n");
177         for (node = 0; node < node_affinity.num_possible_nodes; node++)
178                 hfi1_per_node_cntr[node] = 1;
179
180         pci_dev_put(dev);
181
182         return 0;
183 }
184
185 static void node_affinity_destroy(struct hfi1_affinity_node *entry)
186 {
187         free_percpu(entry->comp_vect_affinity);
188         kfree(entry);
189 }
190
191 void node_affinity_destroy_all(void)
192 {
193         struct list_head *pos, *q;
194         struct hfi1_affinity_node *entry;
195
196         mutex_lock(&node_affinity.lock);
197         list_for_each_safe(pos, q, &node_affinity.list) {
198                 entry = list_entry(pos, struct hfi1_affinity_node,
199                                    list);
200                 list_del(pos);
201                 node_affinity_destroy(entry);
202         }
203         mutex_unlock(&node_affinity.lock);
204         kfree(hfi1_per_node_cntr);
205 }
206
207 static struct hfi1_affinity_node *node_affinity_allocate(int node)
208 {
209         struct hfi1_affinity_node *entry;
210
211         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
212         if (!entry)
213                 return NULL;
214         entry->node = node;
215         entry->comp_vect_affinity = alloc_percpu(u16);
216         INIT_LIST_HEAD(&entry->list);
217
218         return entry;
219 }
220
221 /*
222  * Append an entry to the list.
223  * It *must* be called with node_affinity.lock held.
224  */
225 static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
226 {
227         list_add_tail(&entry->list, &node_affinity.list);
228 }
229
230 /* It must be called with node_affinity.lock held */
231 static struct hfi1_affinity_node *node_affinity_lookup(int node)
232 {
233         struct hfi1_affinity_node *entry;
234
235         list_for_each_entry(entry, &node_affinity.list, list) {
236                 if (entry->node == node)
237                         return entry;
238         }
239
240         return NULL;
241 }
242
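/*
 * Return the CPU in 'possible_cpumask' with the lowest completion-vector
 * usage count and increment that CPU's counter.
 */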
243 static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
244                                 u16 __percpu *comp_vect_affinity)
245 {
246         int curr_cpu;
247         u16 cntr;
248         u16 prev_cntr;
249         int ret_cpu;
250
251         if (!possible_cpumask) {
252                 ret_cpu = -EINVAL;
253                 goto fail;
254         }
255
256         if (!comp_vect_affinity) {
257                 ret_cpu = -EINVAL;
258                 goto fail;
259         }
260
261         ret_cpu = cpumask_first(possible_cpumask);
262         if (ret_cpu >= nr_cpu_ids) {
263                 ret_cpu = -EINVAL;
264                 goto fail;
265         }
266
267         prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
268         for_each_cpu(curr_cpu, possible_cpumask) {
269                 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
270
271                 if (cntr < prev_cntr) {
272                         ret_cpu = curr_cpu;
273                         prev_cntr = cntr;
274                 }
275         }
276
277         *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;
278
279 fail:
280         return ret_cpu;
281 }
282
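/*
 * Find the CPU in 'possible_cpumask' with the highest completion-vector
 * usage count, decrement that CPU's counter, and return the CPU.
 */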
283 static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
284                                     u16 __percpu *comp_vect_affinity)
285 {
286         int curr_cpu;
287         int max_cpu;
288         u16 cntr;
289         u16 prev_cntr;
290
291         if (!possible_cpumask)
292                 return -EINVAL;
293
294         if (!comp_vect_affinity)
295                 return -EINVAL;
296
297         max_cpu = cpumask_first(possible_cpumask);
298         if (max_cpu >= nr_cpu_ids)
299                 return -EINVAL;
300
301         prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
302         for_each_cpu(curr_cpu, possible_cpumask) {
303                 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
304
305                 if (cntr > prev_cntr) {
306                         max_cpu = curr_cpu;
307                         prev_cntr = cntr;
308                 }
309         }
310
311         *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;
312
313         return max_cpu;
314 }
315
316 /*
317  * Non-interrupt CPUs are used first, then interrupt CPUs.
318  * Two already allocated cpu masks must be passed.
319  */
320 static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
321                                   struct hfi1_affinity_node *entry,
322                                   cpumask_var_t non_intr_cpus,
323                                   cpumask_var_t available_cpus)
324         __must_hold(&node_affinity.lock)
325 {
326         int cpu;
327         struct cpu_mask_set *set = dd->comp_vect;
328
329         lockdep_assert_held(&node_affinity.lock);
330         if (!non_intr_cpus) {
331                 cpu = -1;
332                 goto fail;
333         }
334
335         if (!available_cpus) {
336                 cpu = -1;
337                 goto fail;
338         }
339
340         /* Available CPUs for pinning completion vectors */
341         _cpu_mask_set_gen_inc(set);
342         cpumask_andnot(available_cpus, &set->mask, &set->used);
343
344         /* Available CPUs without SDMA engine interrupts */
345         cpumask_andnot(non_intr_cpus, available_cpus,
346                        &entry->def_intr.used);
347
348         /* If there are non-interrupt CPUs available, use them first */
349         if (!cpumask_empty(non_intr_cpus))
350                 cpu = cpumask_first(non_intr_cpus);
351         else /* Otherwise, use interrupt CPUs */
352                 cpu = cpumask_first(available_cpus);
353
354         if (cpu >= nr_cpu_ids) { /* empty */
355                 cpu = -1;
356                 goto fail;
357         }
358         cpumask_set_cpu(cpu, &set->used);
359
360 fail:
361         return cpu;
362 }
363
364 static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
365 {
366         struct cpu_mask_set *set = dd->comp_vect;
367
368         if (cpu < 0)
369                 return;
370
371         cpu_mask_set_put(set, cpu);
372 }
373
374 /* _dev_comp_vect_mappings_destroy() is reentrant */
375 static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
376 {
377         int i, cpu;
378
379         if (!dd->comp_vect_mappings)
380                 return;
381
382         for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
383                 cpu = dd->comp_vect_mappings[i];
384                 _dev_comp_vect_cpu_put(dd, cpu);
385                 dd->comp_vect_mappings[i] = -1;
386                 hfi1_cdbg(AFFINITY,
387                           "[%s] Release CPU %d from completion vector %d",
388                           rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
389         }
390
391         kfree(dd->comp_vect_mappings);
392         dd->comp_vect_mappings = NULL;
393 }
394
395 /*
396  * This function creates the table for looking up CPUs for completion vectors.
397  * num_comp_vectors needs to have been initialized before calling this function.
398  */
399 static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
400                                           struct hfi1_affinity_node *entry)
401         __must_hold(&node_affinity.lock)
402 {
403         int i, cpu, ret;
404         cpumask_var_t non_intr_cpus;
405         cpumask_var_t available_cpus;
406
407         lockdep_assert_held(&node_affinity.lock);
408
409         if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
410                 return -ENOMEM;
411
412         if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
413                 free_cpumask_var(non_intr_cpus);
414                 return -ENOMEM;
415         }
416
417         dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
418                                          sizeof(*dd->comp_vect_mappings),
419                                          GFP_KERNEL);
420         if (!dd->comp_vect_mappings) {
421                 ret = -ENOMEM;
422                 goto fail;
423         }
424         for (i = 0; i < dd->comp_vect_possible_cpus; i++)
425                 dd->comp_vect_mappings[i] = -1;
426
427         for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
428                 cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
429                                              available_cpus);
430                 if (cpu < 0) {
431                         ret = -EINVAL;
432                         goto fail;
433                 }
434
435                 dd->comp_vect_mappings[i] = cpu;
436                 hfi1_cdbg(AFFINITY,
437                           "[%s] Completion Vector %d -> CPU %d",
438                           rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
439         }
440
441         free_cpumask_var(available_cpus);
442         free_cpumask_var(non_intr_cpus);
443         return 0;
444
445 fail:
446         free_cpumask_var(available_cpus);
447         free_cpumask_var(non_intr_cpus);
448         _dev_comp_vect_mappings_destroy(dd);
449
450         return ret;
451 }
452
453 int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
454 {
455         int ret;
456         struct hfi1_affinity_node *entry;
457
458         mutex_lock(&node_affinity.lock);
459         entry = node_affinity_lookup(dd->node);
460         if (!entry) {
461                 ret = -EINVAL;
462                 goto unlock;
463         }
464         ret = _dev_comp_vect_mappings_create(dd, entry);
465 unlock:
466         mutex_unlock(&node_affinity.lock);
467
468         return ret;
469 }
470
471 void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
472 {
473         _dev_comp_vect_mappings_destroy(dd);
474 }
475
476 int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
477 {
478         struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
479         struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
480
481         if (!dd->comp_vect_mappings)
482                 return -EINVAL;
483         if (comp_vect >= dd->comp_vect_possible_cpus)
484                 return -EINVAL;
485
486         return dd->comp_vect_mappings[comp_vect];
487 }
488
489 /*
490  * It assumes dd->comp_vect_possible_cpus is available.
491  */
492 static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
493                                         struct hfi1_affinity_node *entry,
494                                         bool first_dev_init)
495         __must_hold(&node_affinity.lock)
496 {
497         int i, j, curr_cpu;
498         int possible_cpus_comp_vect = 0;
499         struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;
500
501         lockdep_assert_held(&node_affinity.lock);
502         /*
503          * If there's only one CPU available for completion vectors, then
504          * there will only be one completion vector available. Otherwise,
505          * the number of completion vectors available will be the number of
506          * available CPUs divided by the number of devices in the
507          * local NUMA node.
508          */
509         if (cpumask_weight(&entry->comp_vect_mask) == 1) {
510                 possible_cpus_comp_vect = 1;
511                 dd_dev_warn(dd,
512                             "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
513         } else {
514                 possible_cpus_comp_vect +=
515                         cpumask_weight(&entry->comp_vect_mask) /
516                                        hfi1_per_node_cntr[dd->node];
517
518                 /*
519                  * If the available completion vector CPUs don't divide
520                  * evenly among devices, then the first device to be
521                  * initialized gets an extra CPU.
522                  */
523                 if (first_dev_init &&
524                     cpumask_weight(&entry->comp_vect_mask) %
525                     hfi1_per_node_cntr[dd->node] != 0)
526                         possible_cpus_comp_vect++;
527         }
528
529         dd->comp_vect_possible_cpus = possible_cpus_comp_vect;
530
531         /* Reserving CPUs for device completion vector */
532         for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
533                 curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
534                                                 entry->comp_vect_affinity);
535                 if (curr_cpu < 0)
536                         goto fail;
537
538                 cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
539         }
540
541         hfi1_cdbg(AFFINITY,
542                   "[%s] Completion vector affinity CPU set(s) %*pbl",
543                   rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
544                   cpumask_pr_args(dev_comp_vect_mask));
545
546         return 0;
547
548 fail:
549         for (j = 0; j < i; j++)
550                 per_cpu_affinity_put_max(&entry->comp_vect_mask,
551                                          entry->comp_vect_affinity);
552
553         return curr_cpu;
554 }
555
556 /*
557  * It assumes dd->comp_vect_possible_cpus is available.
558  */
559 static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
560                                              struct hfi1_affinity_node *entry)
561         __must_hold(&node_affinity.lock)
562 {
563         int i, cpu;
564
565         lockdep_assert_held(&node_affinity.lock);
566         if (!dd->comp_vect_possible_cpus)
567                 return;
568
569         for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
570                 cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
571                                                entry->comp_vect_affinity);
572                 /* Clearing CPU in device completion vector cpu mask */
573                 if (cpu >= 0)
574                         cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
575         }
576
577         dd->comp_vect_possible_cpus = 0;
578 }
579
580 /*
581  * Interrupt affinity.
582  *
583  * non-rcv avail gets a default mask that
584  * starts as possible cpus with threads reset
585  * and each rcv avail reset.
586  *
587  * rcv avail gets node relative 1 wrapping back
588  * to the node relative 1 as necessary.
589  *
590  */
591 int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
592 {
593         struct hfi1_affinity_node *entry;
594         const struct cpumask *local_mask;
595         int curr_cpu, possible, i, ret;
596         bool new_entry = false;
597
598         local_mask = cpumask_of_node(dd->node);
599         if (cpumask_first(local_mask) >= nr_cpu_ids)
600                 local_mask = topology_core_cpumask(0);
601
602         mutex_lock(&node_affinity.lock);
603         entry = node_affinity_lookup(dd->node);
604
605         /*
606          * If this is the first time this NUMA node's affinity is used,
607          * create an entry in the global affinity structure and initialize it.
608          */
609         if (!entry) {
610                 entry = node_affinity_allocate(dd->node);
611                 if (!entry) {
612                         dd_dev_err(dd,
613                                    "Unable to allocate global affinity node\n");
614                         ret = -ENOMEM;
615                         goto fail;
616                 }
617                 new_entry = true;
618
619                 init_cpu_mask_set(&entry->def_intr);
620                 init_cpu_mask_set(&entry->rcv_intr);
621                 cpumask_clear(&entry->comp_vect_mask);
622                 cpumask_clear(&entry->general_intr_mask);
623                 /* Use the "real" cpu mask of this node as the default */
624                 cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
625                             local_mask);
626
627                 /* fill in the receive list */
628                 possible = cpumask_weight(&entry->def_intr.mask);
629                 curr_cpu = cpumask_first(&entry->def_intr.mask);
630
631                 if (possible == 1) {
632                         /* only one CPU, everyone will use it */
633                         cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
634                         cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
635                 } else {
636                         /*
637                          * The general/control context will be the first CPU in
638                          * the default list, so it is removed from the default
639                          * list and added to the general interrupt list.
640                          */
641                         cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
642                         cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
643                         curr_cpu = cpumask_next(curr_cpu,
644                                                 &entry->def_intr.mask);
645
646                         /*
647                          * Remove the remaining kernel receive queues from
648                          * the default list and add them to the receive list.
649                          */
650                         for (i = 0;
651                              i < (dd->n_krcv_queues - 1) *
652                                   hfi1_per_node_cntr[dd->node];
653                              i++) {
654                                 cpumask_clear_cpu(curr_cpu,
655                                                   &entry->def_intr.mask);
656                                 cpumask_set_cpu(curr_cpu,
657                                                 &entry->rcv_intr.mask);
658                                 curr_cpu = cpumask_next(curr_cpu,
659                                                         &entry->def_intr.mask);
660                                 if (curr_cpu >= nr_cpu_ids)
661                                         break;
662                         }
663
664                         /*
665                          * If there ends up being 0 CPU cores leftover for SDMA
666                          * engines, use the same CPU cores as general/control
667                          * context.
668                          */
669                         if (cpumask_empty(&entry->def_intr.mask))
670                                 cpumask_copy(&entry->def_intr.mask,
671                                              &entry->general_intr_mask);
672                 }
673
674                 /* Determine completion vector CPUs for the entire node */
675                 cpumask_and(&entry->comp_vect_mask,
676                             &node_affinity.real_cpu_mask, local_mask);
677                 cpumask_andnot(&entry->comp_vect_mask,
678                                &entry->comp_vect_mask,
679                                &entry->rcv_intr.mask);
680                 cpumask_andnot(&entry->comp_vect_mask,
681                                &entry->comp_vect_mask,
682                                &entry->general_intr_mask);
683
684                 /*
685                  * If there ends up being 0 CPU cores leftover for completion
686                  * vectors, use the same CPU core as the general/control
687                  * context.
688                  */
689                 if (cpumask_empty(&entry->comp_vect_mask))
690                         cpumask_copy(&entry->comp_vect_mask,
691                                      &entry->general_intr_mask);
692         }
693
694         ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
695         if (ret < 0)
696                 goto fail;
697
698         if (new_entry)
699                 node_affinity_add_tail(entry);
700
701         dd->affinity_entry = entry;
702         mutex_unlock(&node_affinity.lock);
703
704         return 0;
705
706 fail:
707         if (new_entry)
708                 node_affinity_destroy(entry);
709         mutex_unlock(&node_affinity.lock);
710         return ret;
711 }
712
713 void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
714 {
715         struct hfi1_affinity_node *entry;
716
717         mutex_lock(&node_affinity.lock);
718         if (!dd->affinity_entry)
719                 goto unlock;
720         entry = node_affinity_lookup(dd->node);
721         if (!entry)
722                 goto unlock;
723
724         /*
725          * Free device completion vector CPUs to be used by future
726          * completion vectors
727          */
728         _dev_comp_vect_cpu_mask_clean_up(dd, entry);
729 unlock:
730         dd->affinity_entry = NULL;
731         mutex_unlock(&node_affinity.lock);
732 }
733
734 /*
735  * Update the irq affinity hint for an msix vector after it has been
736  * changed by the user via the /proc/irq interface. This function only
737  * accepts one cpu in the mask.
738  */
739 static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
740 {
741         struct sdma_engine *sde = msix->arg;
742         struct hfi1_devdata *dd = sde->dd;
743         struct hfi1_affinity_node *entry;
744         struct cpu_mask_set *set;
745         int i, old_cpu;
746
747         if (cpu > num_online_cpus() || cpu == sde->cpu)
748                 return;
749
750         mutex_lock(&node_affinity.lock);
751         entry = node_affinity_lookup(dd->node);
752         if (!entry)
753                 goto unlock;
754
755         old_cpu = sde->cpu;
756         sde->cpu = cpu;
757         cpumask_clear(&msix->mask);
758         cpumask_set_cpu(cpu, &msix->mask);
759         dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
760                    msix->irq, irq_type_names[msix->type],
761                    sde->this_idx, cpu);
762         irq_set_affinity_hint(msix->irq, &msix->mask);
763
764         /*
765          * Set the new cpu in the hfi1_affinity_node and clean
766          * the old cpu if it is not used by any other IRQ
767          */
768         set = &entry->def_intr;
769         cpumask_set_cpu(cpu, &set->mask);
770         cpumask_set_cpu(cpu, &set->used);
771         for (i = 0; i < dd->msix_info.max_requested; i++) {
772                 struct hfi1_msix_entry *other_msix;
773
774                 other_msix = &dd->msix_info.msix_entries[i];
775                 if (other_msix->type != IRQ_SDMA || other_msix == msix)
776                         continue;
777
778                 if (cpumask_test_cpu(old_cpu, &other_msix->mask))
779                         goto unlock;
780         }
781         cpumask_clear_cpu(old_cpu, &set->mask);
782         cpumask_clear_cpu(old_cpu, &set->used);
783 unlock:
784         mutex_unlock(&node_affinity.lock);
785 }
786
787 static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
788                                      const cpumask_t *mask)
789 {
790         int cpu = cpumask_first(mask);
791         struct hfi1_msix_entry *msix = container_of(notify,
792                                                     struct hfi1_msix_entry,
793                                                     notify);
794
795         /* Only one CPU configuration supported currently */
796         hfi1_update_sdma_affinity(msix, cpu);
797 }
798
799 static void hfi1_irq_notifier_release(struct kref *ref)
800 {
801         /*
802          * This is required by affinity notifier. We don't have anything to
803          * free here.
804          */
805 }
806
807 static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
808 {
809         struct irq_affinity_notify *notify = &msix->notify;
810
811         notify->irq = msix->irq;
812         notify->notify = hfi1_irq_notifier_notify;
813         notify->release = hfi1_irq_notifier_release;
814
815         if (irq_set_affinity_notifier(notify->irq, notify))
816                 pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
817                        notify->irq);
818 }
819
820 static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
821 {
822         struct irq_affinity_notify *notify = &msix->notify;
823
824         if (irq_set_affinity_notifier(notify->irq, NULL))
825                 pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
826                        notify->irq);
827 }
828
829 /*
830  * Set the irq affinity for an msix vector.
831  * It *must* be called with node_affinity.lock held.
832  */
833 static int get_irq_affinity(struct hfi1_devdata *dd,
834                             struct hfi1_msix_entry *msix)
835 {
836         cpumask_var_t diff;
837         struct hfi1_affinity_node *entry;
838         struct cpu_mask_set *set = NULL;
839         struct sdma_engine *sde = NULL;
840         struct hfi1_ctxtdata *rcd = NULL;
841         char extra[64];
842         int cpu = -1;
843
844         extra[0] = '\0';
845         cpumask_clear(&msix->mask);
846
847         entry = node_affinity_lookup(dd->node);
848
849         switch (msix->type) {
850         case IRQ_SDMA:
851                 sde = (struct sdma_engine *)msix->arg;
852                 scnprintf(extra, 64, "engine %u", sde->this_idx);
853                 set = &entry->def_intr;
854                 break;
855         case IRQ_GENERAL:
856                 cpu = cpumask_first(&entry->general_intr_mask);
857                 break;
858         case IRQ_RCVCTXT:
859                 rcd = (struct hfi1_ctxtdata *)msix->arg;
860                 if (rcd->ctxt == HFI1_CTRL_CTXT)
861                         cpu = cpumask_first(&entry->general_intr_mask);
862                 else
863                         set = &entry->rcv_intr;
864                 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
865                 break;
866         case IRQ_NETDEVCTXT:
867                 rcd = (struct hfi1_ctxtdata *)msix->arg;
868                 set = &entry->def_intr;
869                 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
870                 break;
871         default:
872                 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
873                 return -EINVAL;
874         }
875
876         /*
877          * The general and control contexts are placed on a particular
878          * CPU, which is set above. Skip accounting for it. Everything else
879          * finds its CPU here.
880          */
881         if (cpu == -1 && set) {
882                 if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
883                         return -ENOMEM;
884
885                 cpu = cpu_mask_set_get_first(set, diff);
886                 if (cpu < 0) {
887                         free_cpumask_var(diff);
888                         dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
889                         return cpu;
890                 }
891
892                 free_cpumask_var(diff);
893         }
894
895         cpumask_set_cpu(cpu, &msix->mask);
896         dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
897                     msix->irq, irq_type_names[msix->type],
898                     extra, cpu);
899         irq_set_affinity_hint(msix->irq, &msix->mask);
900
901         if (msix->type == IRQ_SDMA) {
902                 sde->cpu = cpu;
903                 hfi1_setup_sdma_notifier(msix);
904         }
905
906         return 0;
907 }
908
909 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
910 {
911         int ret;
912
913         mutex_lock(&node_affinity.lock);
914         ret = get_irq_affinity(dd, msix);
915         mutex_unlock(&node_affinity.lock);
916         return ret;
917 }
918
919 void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
920                            struct hfi1_msix_entry *msix)
921 {
922         struct cpu_mask_set *set = NULL;
923         struct hfi1_affinity_node *entry;
924
925         mutex_lock(&node_affinity.lock);
926         entry = node_affinity_lookup(dd->node);
927
928         switch (msix->type) {
929         case IRQ_SDMA:
930                 set = &entry->def_intr;
931                 hfi1_cleanup_sdma_notifier(msix);
932                 break;
933         case IRQ_GENERAL:
934                 /* Don't do accounting for general contexts */
935                 break;
936         case IRQ_RCVCTXT: {
937                 struct hfi1_ctxtdata *rcd = msix->arg;
938
939                 /* Don't do accounting for control contexts */
940                 if (rcd->ctxt != HFI1_CTRL_CTXT)
941                         set = &entry->rcv_intr;
942                 break;
943         }
944         case IRQ_NETDEVCTXT:
945                 set = &entry->def_intr;
946                 break;
947         default:
948                 mutex_unlock(&node_affinity.lock);
949                 return;
950         }
951
952         if (set) {
953                 cpumask_andnot(&set->used, &set->used, &msix->mask);
954                 _cpu_mask_set_gen_dec(set);
955         }
956
957         irq_set_affinity_hint(msix->irq, NULL);
958         cpumask_clear(&msix->mask);
959         mutex_unlock(&node_affinity.lock);
960 }
961
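/*
 * Build the mask of CPUs that make up hardware thread 'hw_thread_no':
 * one HT sibling from each physical core across all online NUMA nodes.
 */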
962 /* This should be called with node_affinity.lock held */
963 static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
964                                 struct hfi1_affinity_node_list *affinity)
965 {
966         int possible, curr_cpu, i;
967         uint num_cores_per_socket = node_affinity.num_online_cpus /
968                                         affinity->num_core_siblings /
969                                                 node_affinity.num_online_nodes;
970
971         cpumask_copy(hw_thread_mask, &affinity->proc.mask);
972         if (affinity->num_core_siblings > 0) {
973                 /* Removing other siblings not needed for now */
974                 possible = cpumask_weight(hw_thread_mask);
975                 curr_cpu = cpumask_first(hw_thread_mask);
976                 for (i = 0;
977                      i < num_cores_per_socket * node_affinity.num_online_nodes;
978                      i++)
979                         curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
980
981                 for (; i < possible; i++) {
982                         cpumask_clear_cpu(curr_cpu, hw_thread_mask);
983                         curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
984                 }
985
986                 /* Identifying correct HW threads within physical cores */
987                 cpumask_shift_left(hw_thread_mask, hw_thread_mask,
988                                    num_cores_per_socket *
989                                    node_affinity.num_online_nodes *
990                                    hw_thread_no);
991         }
992 }
993
994 int hfi1_get_proc_affinity(int node)
995 {
996         int cpu = -1, ret, i;
997         struct hfi1_affinity_node *entry;
998         cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
999         const struct cpumask *node_mask,
1000                 *proc_mask = current->cpus_ptr;
1001         struct hfi1_affinity_node_list *affinity = &node_affinity;
1002         struct cpu_mask_set *set = &affinity->proc;
1003
1004         /*
1005          * check whether process/context affinity has already
1006          * been set
1007          */
1008         if (current->nr_cpus_allowed == 1) {
1009                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
1010                           current->pid, current->comm,
1011                           cpumask_pr_args(proc_mask));
1012                 /*
1013                  * Mark the pre-set CPU as used. This is atomic so we don't
1014                  * need the lock
1015                  */
1016                 cpu = cpumask_first(proc_mask);
1017                 cpumask_set_cpu(cpu, &set->used);
1018                 goto done;
1019         } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
1020                 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
1021                           current->pid, current->comm,
1022                           cpumask_pr_args(proc_mask));
1023                 goto done;
1024         }
1025
1026         /*
1027          * The process does not have a preset CPU affinity so find one to
1028          * recommend using the following algorithm:
1029          *
1030          * For each user process that is opening a context on HFI Y:
1031          *  a) If all cores are filled, reinitialize the bitmask
1032  *  b) Fill real cores first, then HT cores (first set of HT
1033  *     cores on all physical cores, then second set of HT cores,
1034  *     and so on) in the following order:
1035          *
1036          *     1. Same NUMA node as HFI Y and not running an IRQ
1037          *        handler
1038          *     2. Same NUMA node as HFI Y and running an IRQ handler
1039          *     3. Different NUMA node to HFI Y and not running an IRQ
1040          *        handler
1041          *     4. Different NUMA node to HFI Y and running an IRQ
1042          *        handler
1043          *  c) Mark core as filled in the bitmask. As user processes are
1044          *     done, clear cores from the bitmask.
1045          */
1046
1047         ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
1048         if (!ret)
1049                 goto done;
1050         ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
1051         if (!ret)
1052                 goto free_diff;
1053         ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
1054         if (!ret)
1055                 goto free_hw_thread_mask;
1056         ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
1057         if (!ret)
1058                 goto free_available_mask;
1059
1060         mutex_lock(&affinity->lock);
1061         /*
1062          * If we've used all available HW threads, clear the mask and start
1063          * overloading.
1064          */
1065         _cpu_mask_set_gen_inc(set);
1066
1067         /*
1068          * If NUMA node has CPUs used by interrupt handlers, include them in the
1069          * interrupt handler mask.
1070          */
1071         entry = node_affinity_lookup(node);
1072         if (entry) {
1073                 cpumask_copy(intrs_mask, (entry->def_intr.gen ?
1074                                           &entry->def_intr.mask :
1075                                           &entry->def_intr.used));
1076                 cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
1077                                                     &entry->rcv_intr.mask :
1078                                                     &entry->rcv_intr.used));
1079                 cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
1080         }
1081         hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
1082                   cpumask_pr_args(intrs_mask));
1083
1084         cpumask_copy(hw_thread_mask, &set->mask);
1085
1086         /*
1087          * If HT cores are enabled, identify which HW threads within the
1088          * physical cores should be used.
1089          */
1090         if (affinity->num_core_siblings > 0) {
1091                 for (i = 0; i < affinity->num_core_siblings; i++) {
1092                         find_hw_thread_mask(i, hw_thread_mask, affinity);
1093
1094                         /*
1095                          * If there's at least one available core for this HW
1096                          * thread number, stop looking for a core.
1097                          *
1098                          * diff will not be empty at least once in this
1099                          * loop as the used mask gets reset when
1100                          * (set->mask == set->used) before this loop.
1101                          */
1102                         cpumask_andnot(diff, hw_thread_mask, &set->used);
1103                         if (!cpumask_empty(diff))
1104                                 break;
1105                 }
1106         }
1107         hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
1108                   cpumask_pr_args(hw_thread_mask));
1109
1110         node_mask = cpumask_of_node(node);
1111         hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
1112                   cpumask_pr_args(node_mask));
1113
1114         /* Get cpumask of available CPUs on preferred NUMA */
1115         cpumask_and(available_mask, hw_thread_mask, node_mask);
1116         cpumask_andnot(available_mask, available_mask, &set->used);
1117         hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
1118                   cpumask_pr_args(available_mask));
1119
1120         /*
1121          * At first, we don't want to place processes on the same
1122          * CPUs as interrupt handlers. Then, CPUs running interrupt
1123          * handlers are used.
1124          *
1125          * 1) If diff is not empty, then there are CPUs not running
1126          *    non-interrupt handlers available, so diff gets copied
1127          *    over to available_mask.
1128          * 2) If diff is empty, then all CPUs not running interrupt
1129          *    handlers are taken, so available_mask contains all
1130          *    available CPUs running interrupt handlers.
1131          * 3) If available_mask is empty, then all CPUs on the
1132          *    preferred NUMA node are taken, so other NUMA nodes are
1133          *    used for process assignments using the same method as
1134          *    the preferred NUMA node.
1135          */
1136         cpumask_andnot(diff, available_mask, intrs_mask);
1137         if (!cpumask_empty(diff))
1138                 cpumask_copy(available_mask, diff);
1139
1140         /* If we don't have CPUs on the preferred node, use other NUMA nodes */
1141         if (cpumask_empty(available_mask)) {
1142                 cpumask_andnot(available_mask, hw_thread_mask, &set->used);
1143                 /* Excluding preferred NUMA cores */
1144                 cpumask_andnot(available_mask, available_mask, node_mask);
1145                 hfi1_cdbg(PROC,
1146                           "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
1147                           cpumask_pr_args(available_mask));
1148
1149                 /*
1150                  * At first, we don't want to place processes on the same
1151                  * CPUs as interrupt handlers.
1152                  */
1153                 cpumask_andnot(diff, available_mask, intrs_mask);
1154                 if (!cpumask_empty(diff))
1155                         cpumask_copy(available_mask, diff);
1156         }
1157         hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
1158                   cpumask_pr_args(available_mask));
1159
1160         cpu = cpumask_first(available_mask);
1161         if (cpu >= nr_cpu_ids) /* empty */
1162                 cpu = -1;
1163         else
1164                 cpumask_set_cpu(cpu, &set->used);
1165
1166         mutex_unlock(&affinity->lock);
1167         hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
1168
1169         free_cpumask_var(intrs_mask);
1170 free_available_mask:
1171         free_cpumask_var(available_mask);
1172 free_hw_thread_mask:
1173         free_cpumask_var(hw_thread_mask);
1174 free_diff:
1175         free_cpumask_var(diff);
1176 done:
1177         return cpu;
1178 }
1179
1180 void hfi1_put_proc_affinity(int cpu)
1181 {
1182         struct hfi1_affinity_node_list *affinity = &node_affinity;
1183         struct cpu_mask_set *set = &affinity->proc;
1184
1185         if (cpu < 0)
1186                 return;
1187
1188         mutex_lock(&affinity->lock);
1189         cpu_mask_set_put(set, cpu);
1190         hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
1191         mutex_unlock(&affinity->lock);
1192 }