// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

/* Number of free EPC pages; updated under the per-node free page list locks. */
static unsigned long sgx_nr_free_pages;

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list, and made available for the page allocator. SECS pages
 * that precede their child pages in the input list are left intact.
 */
static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
}
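
/*
 * Return true if the page has not been accessed, via any mm that maps the
 * enclave, since the last scan, i.e. it is old enough to reclaim.
 * sgx_encl_test_and_clear_young() also clears the accessed bit, so the next
 * scan starts from a clean slate.
 */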
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	return ret;
}
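
/*
 * Zap the PTEs for the reclaimed page from every mm that maps the enclave,
 * retrying if the mm list changes underneath us, and then mark the page as
 * blocked with EBLOCK so that no new TLB mappings can be created for it.
 */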
static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	unsigned long mm_list_version;
	struct sgx_encl_mm *encl_mm;
	struct vm_area_struct *vma;
	int idx, ret;

	do {
		mm_list_version = encl->mm_list_version;

		/* Pairs with smp_wmb() in sgx_encl_mm_add(). */
		smp_rmb();

		idx = srcu_read_lock(&encl->srcu);

		list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
			if (!mmget_not_zero(encl_mm->mm))
				continue;

			mmap_read_lock(encl_mm->mm);

			ret = sgx_encl_find(encl_mm->mm, addr, &vma);
			if (!ret && encl == vma->vm_private_data)
				zap_vma_ptes(vma, addr, PAGE_SIZE);

			mmap_read_unlock(encl_mm->mm);

			mmput_async(encl_mm->mm);
		}

		srcu_read_unlock(&encl->srcu, idx);
	} while (unlikely(encl->mm_list_version != mm_list_version));

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
	pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);

	kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
					      backing->pcmd_offset));
	kunmap_atomic((void *)(unsigned long)pginfo.contents);

	return ret;
}
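
/*
 * The IPI callback is deliberately empty: the interrupt itself forces each
 * targeted CPU out of the enclave (an asynchronous enclave exit), which is
 * all that the EWB retry logic needs.
 */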
static void sgx_ipi_cb(void *info)
{
}

static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
{
	cpumask_t *cpumask = &encl->cpumask;
	struct sgx_encl_mm *encl_mm;
	int idx;

	/*
	 * Can race with sgx_encl_mm_add(), but ETRACK has already been
	 * executed, which means that the CPUs running in the new mm will enter
	 * into the enclave with a fresh epoch.
	 */
	cpumask_clear(cpumask);

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

		mmput_async(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);

	return cpumask;
}

/*
 * Swap the page out to regular memory. The page must first have been
 * transformed to the blocked state with EBLOCK, which means that it can no
 * longer be referenced (no new TLB entries).
 *
 * The first trial just tries to write the page assuming that some other thread
 * has reset the count for threads inside the enclave by using ETRACK, and the
 * previous thread count has been zeroed out. The second trial calls ETRACK
 * before EWB. If that fails we kick all the HW threads out, and then do EWB,
 * which should be guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
					   &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing, true);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip pages that have
 * been accessed since the last scan; move those to the tail of the active
 * page pool so that pages get scanned in an LRU-like fashion.
 *
 * Batch process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() mitigates
 * this to a degree with its three-stage EWB pipeline (EWB, ETRACK + EWB and
 * IPI + EWB), but not sufficiently. Reclaiming one page at a time would also
 * be problematic as it would increase lock contention too much, which would
 * halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_epc_section *section;
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	struct sgx_numa_node *node;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) {
			chunk[cnt++] = epc_page;
		} else {
			/*
			 * The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
		}
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
		ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
		if (ret)
			goto skip;

		mutex_lock(&encl_page->encl->lock);
		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);
		sgx_encl_put_backing(&backing[i], true);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		section = &sgx_epc_sections[epc_page->section];
		node = section->node;

		spin_lock(&node->lock);
		list_add_tail(&epc_page->list, &node->free_page_list);
		sgx_nr_free_pages++;
		spin_unlock(&node->lock);
	}
}
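
/*
 * Reclaim only when free pages are below the given watermark and there is
 * something left to reclaim. The allocator wakes ksgxd once free pages fall
 * below SGX_NR_LOW_PAGES; ksgxd keeps reclaiming while they stay below
 * SGX_NR_HIGH_PAGES.
 */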
static bool sgx_should_reclaim(unsigned long watermark)
{
	return sgx_nr_free_pages < watermark &&
	       !list_empty(&sgx_active_page_list);
}

static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	__sgx_sanitize_pages(&sgx_dirty_page_list);

	/* sanity check: */
	WARN_ON(!list_empty(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	sgx_nr_free_pages--;

	spin_unlock(&node->lock);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
 * from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:		A free EPC page was available.
 * - ERR_PTR(-ENOMEM):		Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}
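
/*
 * Illustrative pairing, not taken verbatim from any caller: a page is
 * typically marked reclaimable once it is live in the enclave, and must be
 * unmarked (backing off on -EBUSY) before it can be freed:
 *
 *	sgx_mark_page_reclaimable(epc_page);
 *	...
 *	if (sgx_unmark_page_reclaimable(epc_page))
 *		return -EBUSY;
 *	sgx_free_epc_page(epc_page);
 */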

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through EPC sections and reserve a free EPC page for the caller.
 * When a page is no longer needed it must be released with
 * sgx_free_epc_page(). If @reclaim is set to true, directly reclaim pages when
 * we are out of pages. No mm's can be locked when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning back to the caller.
 *
 * Return:
 *   an EPC page, or an ERR_PTR() on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}
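
/*
 * Minimal allocation sketch (illustrative; encl_page stands in for a real
 * owner). With @reclaim false the call never reclaims synchronously, so
 * -EBUSY means the caller should let ksgxd make progress and retry:
 *
 *	epc_page = sgx_alloc_epc_page(encl_page, false);
 *	if (IS_ERR(epc_page))
 *		return PTR_ERR(epc_page);
 */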

/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back to the list of free pages. It's the caller's
 * responsibility to make sure that the page is in uninitialized state. In
 * other words, do EREMOVE, EWB or whatever operation is necessary before
 * calling this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	list_add_tail(&page->list, &node->free_page_list);
	sgx_nr_free_pages++;

	spin_unlock(&node->lock);
}

static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	sgx_nr_free_pages += nr_pages;
	return true;
}

/*
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}
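
/*
 * For example, low = 0x00345000 and high = 0x3 yield
 * (0x00345000 & GENMASK_ULL(31, 12)) + (0x3 << 32) = 0x300345000.
 */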

static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");

			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			node_set(nid, sgx_numa_mask);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver must update them to the hash of the enclave's signer
 * before EINIT. KVM needs to update them to the guest's virtual MSR values
 * before doing EINIT from the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}
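
/*
 * Note: the four MSRs together hold a 256-bit (4 x 64-bit) SHA-256 digest of
 * an enclave signer's public key, and the caller must run with preemption
 * disabled, as checked by the WARN above.
 */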

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_kthread;

	return 0;

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);