drivers/misc/habanalabs/common/memory.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  */
7
8 #include <uapi/misc/habanalabs.h>
9 #include "habanalabs.h"
10 #include "../include/hw_ip/mmu/mmu_general.h"
11
12 #include <linux/uaccess.h>
13 #include <linux/slab.h>
14 #include <linux/genalloc.h>
15
16 #define HL_MMU_DEBUG    0
17
18 /*
19  * The va ranges in the context object contain a list of the available chunks
20  * of device virtual memory.
21  * There is one range for host allocations and one for DRAM allocations.
22  *
23  * On initialization, each range contains one chunk covering all of its
24  * available virtual range, which is half of the total device virtual range.
25  *
26  * On each mapping of physical pages, a suitable virtual range chunk (with a
27  * minimum size) is selected from the list. If the chunk size equals the
28  * requested size, the chunk is returned. Otherwise, the chunk is split into
29  * two chunks - one to return as the result and a remainder to stay in the list.
30  *
31  * On each unmapping of a virtual address, the relevant virtual chunk is
32  * returned to the list. The chunk is added to the list and, if its edges match
33  * the edges of the adjacent chunks (meaning a contiguous chunk can be created),
34  * the chunks are merged.
35  *
36  * On finish, the list is checked to contain only one chunk covering the entire
37  * relevant virtual range (which is half of the device's total virtual range).
38  * If not (meaning not all mappings were unmapped), a warning is printed.
39  */
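
/*
 * Illustrative sketch (editor's addition, not part of the driver): assuming a
 * range that initially holds the single chunk [0x1000000, 0x2000000), mapping
 * 0x200000 bytes splits it into the returned chunk [0x1000000, 0x1200000) and
 * the remainder [0x1200000, 0x2000000) which stays in the list; unmapping that
 * address later re-adds the chunk and, since its edges touch the remainder,
 * the two are merged back into one chunk covering the whole range.
 */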
40
41 /*
42  * alloc_device_memory - allocate device memory
43  *
44  * @ctx                 : current context
45  * @args                : host parameters containing the requested size
46  * @ret_handle          : result handle
47  *
48  * This function does the following:
49  * - Allocate the requested size rounded up to 2MB pages
50  * - Return unique handle
51  */
52 static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
53                                 u32 *ret_handle)
54 {
55         struct hl_device *hdev = ctx->hdev;
56         struct hl_vm *vm = &hdev->vm;
57         struct hl_vm_phys_pg_pack *phys_pg_pack;
58         u64 paddr = 0, total_size, num_pgs, i;
59         u32 num_curr_pgs, page_size, page_shift;
60         int handle, rc;
61         bool contiguous;
62
63         num_curr_pgs = 0;
64         page_size = hdev->asic_prop.dram_page_size;
65         page_shift = __ffs(page_size);
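        /*
         * Round the requested size up to whole DRAM pages; e.g. with a 2MB
         * DRAM page size, __ffs() yields a shift of 21, so a 3MB request
         * becomes two pages (4MB total).
         */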
66         num_pgs = (args->alloc.mem_size + (page_size - 1)) >> page_shift;
67         total_size = num_pgs << page_shift;
68
69         if (!total_size) {
70                 dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
71                 return -EINVAL;
72         }
73
74         contiguous = args->flags & HL_MEM_CONTIGUOUS;
75
76         if (contiguous) {
77                 paddr = (u64) gen_pool_alloc(vm->dram_pg_pool, total_size);
78                 if (!paddr) {
79                         dev_err(hdev->dev,
80                                 "failed to allocate %llu huge contiguous pages\n",
81                                 num_pgs);
82                         return -ENOMEM;
83                 }
84         }
85
86         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
87         if (!phys_pg_pack) {
88                 rc = -ENOMEM;
89                 goto pages_pack_err;
90         }
91
92         phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
93         phys_pg_pack->asid = ctx->asid;
94         phys_pg_pack->npages = num_pgs;
95         phys_pg_pack->page_size = page_size;
96         phys_pg_pack->total_size = total_size;
97         phys_pg_pack->flags = args->flags;
98         phys_pg_pack->contiguous = contiguous;
99
100         phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
101         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
102                 rc = -ENOMEM;
103                 goto pages_arr_err;
104         }
105
106         if (phys_pg_pack->contiguous) {
107                 for (i = 0 ; i < num_pgs ; i++)
108                         phys_pg_pack->pages[i] = paddr + i * page_size;
109         } else {
110                 for (i = 0 ; i < num_pgs ; i++) {
111                         phys_pg_pack->pages[i] = (u64) gen_pool_alloc(
112                                                         vm->dram_pg_pool,
113                                                         page_size);
114                         if (!phys_pg_pack->pages[i]) {
115                                 dev_err(hdev->dev,
116                                         "Failed to allocate device memory (out of memory)\n");
117                                 rc = -ENOMEM;
118                                 goto page_err;
119                         }
120
121                         num_curr_pgs++;
122                 }
123         }
124
125         spin_lock(&vm->idr_lock);
126         handle = idr_alloc(&vm->phys_pg_pack_handles, phys_pg_pack, 1, 0,
127                                 GFP_ATOMIC);
128         spin_unlock(&vm->idr_lock);
129
130         if (handle < 0) {
131                 dev_err(hdev->dev, "Failed to get handle for page\n");
132                 rc = -EFAULT;
133                 goto idr_err;
134         }
135
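        /*
         * Each allocated page takes a reference on the DRAM page pool, so the
         * pool (and the handles idr) cannot be destroyed while pages from it
         * are still in use; free_phys_pg_pack() drops these references.
         */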
136         for (i = 0 ; i < num_pgs ; i++)
137                 kref_get(&vm->dram_pg_pool_refcount);
138
139         phys_pg_pack->handle = handle;
140
141         atomic64_add(phys_pg_pack->total_size, &ctx->dram_phys_mem);
142         atomic64_add(phys_pg_pack->total_size, &hdev->dram_used_mem);
143
144         *ret_handle = handle;
145
146         return 0;
147
148 idr_err:
149 page_err:
150         if (!phys_pg_pack->contiguous)
151                 for (i = 0 ; i < num_curr_pgs ; i++)
152                         gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[i],
153                                         page_size);
154
155         kvfree(phys_pg_pack->pages);
156 pages_arr_err:
157         kfree(phys_pg_pack);
158 pages_pack_err:
159         if (contiguous)
160                 gen_pool_free(vm->dram_pg_pool, paddr, total_size);
161
162         return rc;
163 }
164
165 /*
166  * dma_map_host_va - DMA mapping of the given host virtual address.
167  * @hdev: habanalabs device structure
168  * @addr: the host virtual address of the memory area
169  * @size: the size of the memory area
170  * @p_userptr: pointer to result userptr structure
171  *
172  * This function does the following:
173  * - Allocate userptr structure
174  * - Pin the given host memory using the userptr structure
175  * - Perform DMA mapping to have the DMA addresses of the pages
176  */
177 static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
178                                 struct hl_userptr **p_userptr)
179 {
180         struct hl_userptr *userptr;
181         int rc;
182
183         userptr = kzalloc(sizeof(*userptr), GFP_KERNEL);
184         if (!userptr) {
185                 rc = -ENOMEM;
186                 goto userptr_err;
187         }
188
189         rc = hl_pin_host_memory(hdev, addr, size, userptr);
190         if (rc) {
191                 dev_err(hdev->dev, "Failed to pin host memory\n");
192                 goto pin_err;
193         }
194
195         rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
196                                         userptr->sgt->nents, DMA_BIDIRECTIONAL);
197         if (rc) {
198                 dev_err(hdev->dev, "failed to map sgt with DMA region\n");
199                 goto dma_map_err;
200         }
201
202         userptr->dma_mapped = true;
203         userptr->dir = DMA_BIDIRECTIONAL;
204         userptr->vm_type = VM_TYPE_USERPTR;
205
206         *p_userptr = userptr;
207
208         return 0;
209
210 dma_map_err:
211         hl_unpin_host_memory(hdev, userptr);
212 pin_err:
213         kfree(userptr);
214 userptr_err:
215
216         return rc;
217 }
218
219 /*
220  * dma_unmap_host_va - DMA unmapping of the given host virtual address.
221  * @hdev: habanalabs device structure
222  * @userptr: userptr to free
223  *
224  * This function does the following:
225  * - Unpins the physical pages
226  * - Frees the userptr structure
227  */
228 static void dma_unmap_host_va(struct hl_device *hdev,
229                                 struct hl_userptr *userptr)
230 {
231         hl_unpin_host_memory(hdev, userptr);
232         kfree(userptr);
233 }
234
235 /*
236  * dram_pg_pool_do_release - free DRAM pages pool
237  *
238  * @ref                 : pointer to reference object
239  *
240  * This function does the following:
241  * - Frees the idr structure of physical pages handles
242  * - Frees the generic pool of DRAM physical pages
243  */
244 static void dram_pg_pool_do_release(struct kref *ref)
245 {
246         struct hl_vm *vm = container_of(ref, struct hl_vm,
247                         dram_pg_pool_refcount);
248
249         /*
250          * free the idr here, as only at this point do we know for sure that
251          * there are no allocated physical pages and hence no handles in use
252          */
253         idr_destroy(&vm->phys_pg_pack_handles);
254         gen_pool_destroy(vm->dram_pg_pool);
255 }
256
257 /*
258  * free_phys_pg_pack - free physical page pack
259  * @hdev: habanalabs device structure
260  * @phys_pg_pack: physical page pack to free
261  *
262  * This function does the following:
263  * - For DRAM memory only, iterate over the pack and free each physical block
264  *   structure by returning it to the general pool
265  * - Free the hl_vm_phys_pg_pack structure
266  */
267 static void free_phys_pg_pack(struct hl_device *hdev,
268                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
269 {
270         struct hl_vm *vm = &hdev->vm;
271         u64 i;
272
273         if (!phys_pg_pack->created_from_userptr) {
274                 if (phys_pg_pack->contiguous) {
275                         gen_pool_free(vm->dram_pg_pool, phys_pg_pack->pages[0],
276                                         phys_pg_pack->total_size);
277
278                         for (i = 0; i < phys_pg_pack->npages ; i++)
279                                 kref_put(&vm->dram_pg_pool_refcount,
280                                         dram_pg_pool_do_release);
281                 } else {
282                         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
283                                 gen_pool_free(vm->dram_pg_pool,
284                                                 phys_pg_pack->pages[i],
285                                                 phys_pg_pack->page_size);
286                                 kref_put(&vm->dram_pg_pool_refcount,
287                                         dram_pg_pool_do_release);
288                         }
289                 }
290         }
291
292         kvfree(phys_pg_pack->pages);
293         kfree(phys_pg_pack);
294 }
295
296 /*
297  * free_device_memory - free device memory
298  *
299  * @ctx                  : current context
300  * @handle              : handle of the memory chunk to free
301  *
302  * This function does the following:
303  * - Free the device memory related to the given handle
304  */
305 static int free_device_memory(struct hl_ctx *ctx, u32 handle)
306 {
307         struct hl_device *hdev = ctx->hdev;
308         struct hl_vm *vm = &hdev->vm;
309         struct hl_vm_phys_pg_pack *phys_pg_pack;
310
311         spin_lock(&vm->idr_lock);
312         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
313         if (phys_pg_pack) {
314                 if (atomic_read(&phys_pg_pack->mapping_cnt) > 0) {
315                         dev_err(hdev->dev, "handle %u is mapped, cannot free\n",
316                                 handle);
317                         spin_unlock(&vm->idr_lock);
318                         return -EINVAL;
319                 }
320
321                 /*
322                  * must remove the handle from the idr before freeing the
323                  * physical pages, as the pool's refcount is also what
324                  * triggers the idr destruction
325                  */
326                 idr_remove(&vm->phys_pg_pack_handles, handle);
327                 spin_unlock(&vm->idr_lock);
328
329                 atomic64_sub(phys_pg_pack->total_size, &ctx->dram_phys_mem);
330                 atomic64_sub(phys_pg_pack->total_size, &hdev->dram_used_mem);
331
332                 free_phys_pg_pack(hdev, phys_pg_pack);
333         } else {
334                 spin_unlock(&vm->idr_lock);
335                 dev_err(hdev->dev,
336                         "free device memory failed, no match for handle %u\n",
337                         handle);
338                 return -EINVAL;
339         }
340
341         return 0;
342 }
343
344 /*
345  * clear_va_list_locked - free virtual addresses list
346  *
347  * @hdev                : habanalabs device structure
348  * @va_list             : list of virtual addresses to free
349  *
350  * This function does the following:
351  * - Iterate over the list and free each virtual address block
352  *
353  * This function should be called only while the va_list lock is held
354  */
355 static void clear_va_list_locked(struct hl_device *hdev,
356                 struct list_head *va_list)
357 {
358         struct hl_vm_va_block *va_block, *tmp;
359
360         list_for_each_entry_safe(va_block, tmp, va_list, node) {
361                 list_del(&va_block->node);
362                 kfree(va_block);
363         }
364 }
365
366 /*
367  * print_va_list_locked - print virtual addresses list
368  *
369  * @hdev                : habanalabs device structure
370  * @va_list             : list of virtual addresses to print
371  *
372  * This function does the following:
373  * - Iterate over the list and print each virtual address block
374  *
375  * This function should be called only while the va_list lock is held
376  */
377 static void print_va_list_locked(struct hl_device *hdev,
378                 struct list_head *va_list)
379 {
380 #if HL_MMU_DEBUG
381         struct hl_vm_va_block *va_block;
382
383         dev_dbg(hdev->dev, "print va list:\n");
384
385         list_for_each_entry(va_block, va_list, node)
386                 dev_dbg(hdev->dev,
387                         "va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
388                         va_block->start, va_block->end, va_block->size);
389 #endif
390 }
391
392 /*
393  * merge_va_blocks_locked - merge a virtual block if possible
394  *
395  * @hdev                : pointer to the habanalabs device structure
396  * @va_list             : pointer to the virtual addresses block list
397  * @va_block            : virtual block to merge with adjacent blocks
398  *
399  * This function does the following:
400  * - Merge the given block with the adjacent blocks if their virtual ranges
401  *   create a contiguous virtual range
402  *
403  * This function should be called only while the va_list lock is held
404  */
405 static void merge_va_blocks_locked(struct hl_device *hdev,
406                 struct list_head *va_list, struct hl_vm_va_block *va_block)
407 {
408         struct hl_vm_va_block *prev, *next;
409
410         prev = list_prev_entry(va_block, node);
411         if (&prev->node != va_list && prev->end + 1 == va_block->start) {
412                 prev->end = va_block->end;
413                 prev->size = prev->end - prev->start;
414                 list_del(&va_block->node);
415                 kfree(va_block);
416                 va_block = prev;
417         }
418
419         next = list_next_entry(va_block, node);
420         if (&next->node != va_list && va_block->end + 1 == next->start) {
421                 next->start = va_block->start;
422                 next->size = next->end - next->start;
423                 list_del(&va_block->node);
424                 kfree(va_block);
425         }
426 }
427
428 /*
429  * add_va_block_locked - add a virtual block to the virtual addresses list
430  *
431  * @hdev                : pointer to the habanalabs device structure
432  * @va_list             : pointer to the virtual addresses block list
433  * @start               : start virtual address
434  * @end                 : end virtual address
435  *
436  * This function does the following:
437  * - Add the given block to the virtual blocks list and merge with other
438  * blocks if a contiguous virtual block can be created
439  *
440  * This function should be called only while the va_list lock is held
441  */
442 static int add_va_block_locked(struct hl_device *hdev,
443                 struct list_head *va_list, u64 start, u64 end)
444 {
445         struct hl_vm_va_block *va_block, *res = NULL;
446         u64 size = end - start;
447
448         print_va_list_locked(hdev, va_list);
449
450         list_for_each_entry(va_block, va_list, node) {
451                 /* TODO: remove once the code has matured */
452                 if (hl_mem_area_crosses_range(start, size, va_block->start,
453                                 va_block->end)) {
454                         dev_err(hdev->dev,
455                                 "block crossing ranges at start 0x%llx, end 0x%llx\n",
456                                 va_block->start, va_block->end);
457                         return -EINVAL;
458                 }
459
460                 if (va_block->end < start)
461                         res = va_block;
462         }
463
464         va_block = kmalloc(sizeof(*va_block), GFP_KERNEL);
465         if (!va_block)
466                 return -ENOMEM;
467
468         va_block->start = start;
469         va_block->end = end;
470         va_block->size = size;
471
472         if (!res)
473                 list_add(&va_block->node, va_list);
474         else
475                 list_add(&va_block->node, &res->node);
476
477         merge_va_blocks_locked(hdev, va_list, va_block);
478
479         print_va_list_locked(hdev, va_list);
480
481         return 0;
482 }
483
484 /*
485  * add_va_block - wrapper for add_va_block_locked
486  *
487  * @hdev                : pointer to the habanalabs device structure
488  * @va_range            : pointer to the virtual addresses range
489  * @start               : start virtual address
490  * @end                 : end virtual address
491  *
492  * This function does the following:
493  * - Takes the range lock and calls add_va_block_locked
494  */
495 static inline int add_va_block(struct hl_device *hdev,
496                 struct hl_va_range *va_range, u64 start, u64 end)
497 {
498         int rc;
499
500         mutex_lock(&va_range->lock);
501         rc = add_va_block_locked(hdev, &va_range->list, start, end);
502         mutex_unlock(&va_range->lock);
503
504         return rc;
505 }
506
507 /*
508  * get_va_block - get a virtual block with the requested size
509  *
510  * @hdev            : pointer to the habanalabs device structure
511  * @va_range        : pointer to the virtual addresses range
512  * @size            : requested block size
513  * @hint_addr       : hint for the requested address given by the user
514  * @is_userptr      : flag indicating whether the memory is host (userptr) or DRAM
515  *
516  * This function does the following:
517  * - Iterate over the virtual block list to find a suitable virtual block for the
518  *   requested size
519  * - Reserve the requested block and update the list
520  * - Return the start address of the virtual block
521  */
522 static u64 get_va_block(struct hl_device *hdev,
523                         struct hl_va_range *va_range, u64 size, u64 hint_addr,
524                         bool is_userptr)
525 {
526         struct hl_vm_va_block *va_block, *new_va_block = NULL;
527         u64 valid_start, valid_size, prev_start, prev_end, page_mask,
528                 res_valid_start = 0, res_valid_size = 0;
529         u32 page_size;
530         bool add_prev = false;
531
532         if (is_userptr)
533                 /*
534                  * We cannot know if the user allocated memory with huge pages
535                  * or not, hence we continue with the biggest possible
536                  * granularity.
537                  */
538                 page_size = hdev->asic_prop.pmmu_huge.page_size;
539         else
540                 page_size = hdev->asic_prop.dmmu.page_size;
541
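        /*
         * Mask used below to align candidate addresses down to the MMU page
         * size, e.g. a 2MB page size gives page_mask = ~0x1FFFFF.
         */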
542         page_mask = ~((u64)page_size - 1);
543
544         mutex_lock(&va_range->lock);
545
546         print_va_list_locked(hdev, &va_range->list);
547
548         list_for_each_entry(va_block, &va_range->list, node) {
549                 /* calc the first possible aligned addr */
550                 valid_start = va_block->start;
551
552                 if (valid_start & (page_size - 1)) {
553                         valid_start &= page_mask;
554                         valid_start += page_size;
555                         if (valid_start > va_block->end)
556                                 continue;
557                 }
558
559                 valid_size = va_block->end - valid_start;
560
561                 if (valid_size >= size &&
562                         (!new_va_block || valid_size < res_valid_size)) {
563                         new_va_block = va_block;
564                         res_valid_start = valid_start;
565                         res_valid_size = valid_size;
566                 }
567
568                 if (hint_addr && hint_addr >= valid_start &&
569                                 ((hint_addr + size) <= va_block->end)) {
570                         new_va_block = va_block;
571                         res_valid_start = hint_addr;
572                         res_valid_size = valid_size;
573                         break;
574                 }
575         }
576
577         if (!new_va_block) {
578                 dev_err(hdev->dev, "no available va block for size %llu\n",
579                                 size);
580                 goto out;
581         }
582
583         if (res_valid_start > new_va_block->start) {
584                 prev_start = new_va_block->start;
585                 prev_end = res_valid_start - 1;
586
587                 new_va_block->start = res_valid_start;
588                 new_va_block->size = res_valid_size;
589
590                 add_prev = true;
591         }
592
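        /*
         * If the chosen block is larger than needed, shrink it in place so the
         * leftover stays in the list; otherwise the block is consumed entirely
         * and removed.
         */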
593         if (new_va_block->size > size) {
594                 new_va_block->start += size;
595                 new_va_block->size = new_va_block->end - new_va_block->start;
596         } else {
597                 list_del(&new_va_block->node);
598                 kfree(new_va_block);
599         }
600
601         if (add_prev)
602                 add_va_block_locked(hdev, &va_range->list, prev_start,
603                                 prev_end);
604
605         print_va_list_locked(hdev, &va_range->list);
606 out:
607         mutex_unlock(&va_range->lock);
608
609         return res_valid_start;
610 }
611
612 /*
613  * get_sg_info - get number of pages and the DMA address from SG list
614  *
615  * @sg                 : the SG entry
616  * @dma_addr           : pointer to DMA address to return
617  *
618  * Calculate the number of consecutive pages described by the SG entry. Take
619  * the offset of the address within its first page, add the length, and round
620  * up to the number of pages needed.
621  */
622 static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
623 {
624         *dma_addr = sg_dma_address(sg);
625
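        /*
         * Number of PAGE_SIZE pages spanned by this SG entry: the offset of
         * the DMA address within its first page plus the entry length, rounded
         * up to a whole page.
         */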
626         return ((((*dma_addr) & (PAGE_SIZE - 1)) + sg_dma_len(sg)) +
627                         (PAGE_SIZE - 1)) >> PAGE_SHIFT;
628 }
629
630 /*
631  * init_phys_pg_pack_from_userptr - initialize physical page pack from host
632  *                                  memory
633  * @ctx: current context
634  * @userptr: userptr to initialize from
635  * @pphys_pg_pack: result pointer
636  *
637  * This function does the following:
638  * - Create a physical page pack from the already-pinned physical pages that
639  *   back the given userptr
640  * - Decide whether the pack can be mapped with huge pages
641  */
642 static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
643                                 struct hl_userptr *userptr,
644                                 struct hl_vm_phys_pg_pack **pphys_pg_pack)
645 {
646         struct hl_vm_phys_pg_pack *phys_pg_pack;
647         struct scatterlist *sg;
648         dma_addr_t dma_addr;
649         u64 page_mask, total_npages;
650         u32 npages, page_size = PAGE_SIZE,
651                 huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
652         bool first = true, is_huge_page_opt = true;
653         int rc, i, j;
654         u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
655
656         phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
657         if (!phys_pg_pack)
658                 return -ENOMEM;
659
660         phys_pg_pack->vm_type = userptr->vm_type;
661         phys_pg_pack->created_from_userptr = true;
662         phys_pg_pack->asid = ctx->asid;
663         atomic_set(&phys_pg_pack->mapping_cnt, 1);
664
665         /* Only if all dma_addrs are aligned to 2MB and their
666          * sizes are at least 2MB can we use huge page mapping.
667          * We limit the 2MB optimization to this condition,
668          * since later on we acquire the related VA range as one
669          * consecutive block.
670          */
671         total_npages = 0;
672         for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
673                 npages = get_sg_info(sg, &dma_addr);
674
675                 total_npages += npages;
676
677                 if ((npages % pgs_in_huge_page) ||
678                                         (dma_addr & (huge_page_size - 1)))
679                         is_huge_page_opt = false;
680         }
681
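        /*
         * When the huge page optimization applies, total_npages was counted in
         * PAGE_SIZE units and is converted here to huge-page units, e.g. 512
         * 4KB pages per 2MB huge page.
         */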
682         if (is_huge_page_opt) {
683                 page_size = huge_page_size;
684                 do_div(total_npages, pgs_in_huge_page);
685         }
686
687         page_mask = ~(((u64) page_size) - 1);
688
689         phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
690                                                 GFP_KERNEL);
691         if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
692                 rc = -ENOMEM;
693                 goto page_pack_arr_mem_err;
694         }
695
696         phys_pg_pack->npages = total_npages;
697         phys_pg_pack->page_size = page_size;
698         phys_pg_pack->total_size = total_npages * page_size;
699
700         j = 0;
701         for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
702                 npages = get_sg_info(sg, &dma_addr);
703
704                 /* align down to physical page size and save the offset */
705                 if (first) {
706                         first = false;
707                         phys_pg_pack->offset = dma_addr & (page_size - 1);
708                         dma_addr &= page_mask;
709                 }
710
711                 while (npages) {
712                         phys_pg_pack->pages[j++] = dma_addr;
713                         dma_addr += page_size;
714
715                         if (is_huge_page_opt)
716                                 npages -= pgs_in_huge_page;
717                         else
718                                 npages--;
719                 }
720         }
721
722         *pphys_pg_pack = phys_pg_pack;
723
724         return 0;
725
726 page_pack_arr_mem_err:
727         kfree(phys_pg_pack);
728
729         return rc;
730 }
731
732 /*
733  * map_phys_pg_pack - maps the physical page pack.
734  * @ctx: current context
735  * @vaddr: start address of the virtual area to map from
736  * @phys_pg_pack: the pack of physical pages to map to
737  *
738  * This function does the following:
739  * - Maps each chunk of virtual memory to its matching physical chunk
740  * - Unmaps any already-mapped pages if one of the mappings fails
741  * - Returns 0 on success, error code otherwise
742  */
743 static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
744                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
745 {
746         struct hl_device *hdev = ctx->hdev;
747         u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = 0, i;
748         u32 page_size = phys_pg_pack->page_size;
749         int rc = 0;
750
751         for (i = 0 ; i < phys_pg_pack->npages ; i++) {
752                 paddr = phys_pg_pack->pages[i];
753
754                 rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size,
755                                 (i + 1) == phys_pg_pack->npages);
756                 if (rc) {
757                         dev_err(hdev->dev,
758                                 "map failed for handle %u, npages: %llu, mapped: %llu",
759                                 phys_pg_pack->handle, phys_pg_pack->npages,
760                                 mapped_pg_cnt);
761                         goto err;
762                 }
763
764                 mapped_pg_cnt++;
765                 next_vaddr += page_size;
766         }
767
768         return 0;
769
770 err:
771         next_vaddr = vaddr;
772         for (i = 0 ; i < mapped_pg_cnt ; i++) {
773                 if (hl_mmu_unmap(ctx, next_vaddr, page_size,
774                                         (i + 1) == mapped_pg_cnt))
775                         dev_warn_ratelimited(hdev->dev,
776                                 "failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
777                                         phys_pg_pack->handle, next_vaddr,
778                                         phys_pg_pack->pages[i], page_size);
779
780                 next_vaddr += page_size;
781         }
782
783         return rc;
784 }
785
786 /*
787  * unmap_phys_pg_pack - unmaps the physical page pack
788  * @ctx: current context
789  * @vaddr: start address of the virtual area to unmap
790  * @phys_pg_pack: the pack of physical pages to unmap
791  */
792 static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
793                                 struct hl_vm_phys_pg_pack *phys_pg_pack)
794 {
795         struct hl_device *hdev = ctx->hdev;
796         u64 next_vaddr, i;
797         u32 page_size;
798
799         page_size = phys_pg_pack->page_size;
800         next_vaddr = vaddr;
801
802         for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
803                 if (hl_mmu_unmap(ctx, next_vaddr, page_size,
804                                        (i + 1) == phys_pg_pack->npages))
805                         dev_warn_ratelimited(hdev->dev,
806                         "unmap failed for vaddr: 0x%llx\n", next_vaddr);
807
808                 /*
809                  * unmapping on Palladium can be really long, so avoid a CPU
810                  * soft lockup bug by sleeping a little between unmapping pages
811                  */
812                 if (hdev->pldm)
813                         usleep_range(500, 1000);
814         }
815 }
816
817 static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
818                                 u64 *paddr)
819 {
820         struct hl_device *hdev = ctx->hdev;
821         struct hl_vm *vm = &hdev->vm;
822         struct hl_vm_phys_pg_pack *phys_pg_pack;
823         u32 handle;
824
825         handle = lower_32_bits(args->map_device.handle);
826         spin_lock(&vm->idr_lock);
827         phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
828         if (!phys_pg_pack) {
829                 spin_unlock(&vm->idr_lock);
830                 dev_err(hdev->dev, "no match for handle %u\n", handle);
831                 return -EINVAL;
832         }
833
834         *paddr = phys_pg_pack->pages[0];
835
836         spin_unlock(&vm->idr_lock);
837
838         return 0;
839 }
840
841 /*
842  * map_device_va - map the given memory
843  *
844  * @ctx          : current context
845  * @args         : host parameters with handle/host virtual address
846  * @device_addr  : pointer to result device virtual address
847  *
848  * This function does the following:
849  * - If given a physical device memory handle, map to a device virtual block
850  *   and return the start address of this block
851  * - If given a host virtual address and size, find the related physical pages,
852  *   map a device virtual block to these pages and return the start address of
853  *   this block
854  */
855 static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
856                 u64 *device_addr)
857 {
858         struct hl_device *hdev = ctx->hdev;
859         struct hl_vm *vm = &hdev->vm;
860         struct hl_vm_phys_pg_pack *phys_pg_pack;
861         struct hl_userptr *userptr = NULL;
862         struct hl_vm_hash_node *hnode;
863         struct hl_va_range *va_range;
864         enum vm_type_t *vm_type;
865         u64 ret_vaddr, hint_addr;
866         u32 handle = 0;
867         int rc;
868         bool is_userptr = args->flags & HL_MEM_USERPTR;
869
870         /* Assume failure */
871         *device_addr = 0;
872
873         if (is_userptr) {
874                 u64 addr = args->map_host.host_virt_addr,
875                         size = args->map_host.mem_size;
876
877                 rc = dma_map_host_va(hdev, addr, size, &userptr);
878                 if (rc) {
879                         dev_err(hdev->dev, "failed to get userptr from va\n");
880                         return rc;
881                 }
882
883                 rc = init_phys_pg_pack_from_userptr(ctx, userptr,
884                                 &phys_pg_pack);
885                 if (rc) {
886                         dev_err(hdev->dev,
887                                 "unable to init page pack for vaddr 0x%llx\n",
888                                 addr);
889                         goto init_page_pack_err;
890                 }
891
892                 vm_type = (enum vm_type_t *) userptr;
893                 hint_addr = args->map_host.hint_addr;
894                 handle = phys_pg_pack->handle;
895         } else {
896                 handle = lower_32_bits(args->map_device.handle);
897
898                 spin_lock(&vm->idr_lock);
899                 phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, handle);
900                 if (!phys_pg_pack) {
901                         spin_unlock(&vm->idr_lock);
902                         dev_err(hdev->dev,
903                                 "no match for handle %u\n", handle);
904                         return -EINVAL;
905                 }
906
907                 /* increment now to avoid freeing device memory while mapping */
908                 atomic_inc(&phys_pg_pack->mapping_cnt);
909
910                 spin_unlock(&vm->idr_lock);
911
912                 vm_type = (enum vm_type_t *) phys_pg_pack;
913
914                 hint_addr = args->map_device.hint_addr;
915         }
916
917         /*
918          * relevant for mapping device physical memory only, as host memory is
919          * implicitly shared
920          */
921         if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
922                         phys_pg_pack->asid != ctx->asid) {
923                 dev_err(hdev->dev,
924                         "Failed to map memory, handle %u is not shared\n",
925                         handle);
926                 rc = -EPERM;
927                 goto shared_err;
928         }
929
930         hnode = kzalloc(sizeof(*hnode), GFP_KERNEL);
931         if (!hnode) {
932                 rc = -ENOMEM;
933                 goto hnode_err;
934         }
935
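        /*
         * Choose the VA range to allocate from: host mappings use the regular
         * or huge-page host range depending on the pack's page size, while
         * device memory handles always use the DRAM range.
         */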
936         if (is_userptr)
937                 if (phys_pg_pack->page_size == hdev->asic_prop.pmmu.page_size)
938                         va_range = ctx->host_va_range;
939                 else
940                         va_range = ctx->host_huge_va_range;
941         else
942                 va_range = ctx->dram_va_range;
943
944         ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
945                                         hint_addr, is_userptr);
946         if (!ret_vaddr) {
947                 dev_err(hdev->dev, "no available va block for handle %u\n",
948                                 handle);
949                 rc = -ENOMEM;
950                 goto va_block_err;
951         }
952
953         mutex_lock(&ctx->mmu_lock);
954
955         rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
956         if (rc) {
957                 mutex_unlock(&ctx->mmu_lock);
958                 dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
959                                 handle);
960                 goto map_err;
961         }
962
963         rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
964
965         mutex_unlock(&ctx->mmu_lock);
966
967         if (rc) {
968                 dev_err(hdev->dev,
969                         "mapping handle %u failed due to MMU cache invalidation\n",
970                         handle);
971                 goto map_err;
972         }
973
974         ret_vaddr += phys_pg_pack->offset;
975
976         hnode->ptr = vm_type;
977         hnode->vaddr = ret_vaddr;
978
979         mutex_lock(&ctx->mem_hash_lock);
980         hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
981         mutex_unlock(&ctx->mem_hash_lock);
982
983         *device_addr = ret_vaddr;
984
985         if (is_userptr)
986                 free_phys_pg_pack(hdev, phys_pg_pack);
987
988         return 0;
989
990 map_err:
991         if (add_va_block(hdev, va_range, ret_vaddr,
992                                 ret_vaddr + phys_pg_pack->total_size - 1))
993                 dev_warn(hdev->dev,
994                         "release va block failed for handle 0x%x, vaddr: 0x%llx\n",
995                                 handle, ret_vaddr);
996
997 va_block_err:
998         kfree(hnode);
999 hnode_err:
1000 shared_err:
1001         atomic_dec(&phys_pg_pack->mapping_cnt);
1002         if (is_userptr)
1003                 free_phys_pg_pack(hdev, phys_pg_pack);
1004 init_page_pack_err:
1005         if (is_userptr)
1006                 dma_unmap_host_va(hdev, userptr);
1007
1008         return rc;
1009 }
1010
1011 /*
1012  * unmap_device_va      - unmap the given device virtual address
1013  *
1014  * @ctx                 : current context
1015  * @vaddr               : device virtual address to unmap
1016  * @ctx_free            : true if in context free flow, false otherwise.
1017  *
1018  * This function does the following:
1019  * - Unmap the physical pages related to the given virtual address
1020  * - Return the device virtual block to the virtual block list
1021  */
1022 static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
1023 {
1024         struct hl_device *hdev = ctx->hdev;
1025         struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
1026         struct hl_vm_hash_node *hnode = NULL;
1027         struct hl_userptr *userptr = NULL;
1028         struct hl_va_range *va_range;
1029         enum vm_type_t *vm_type;
1030         bool is_userptr;
1031         int rc = 0;
1032
1033         /* protect against double entry */
1034         mutex_lock(&ctx->mem_hash_lock);
1035         hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
1036                 if (vaddr == hnode->vaddr)
1037                         break;
1038
1039         if (!hnode) {
1040                 mutex_unlock(&ctx->mem_hash_lock);
1041                 dev_err(hdev->dev,
1042                         "unmap failed, no mem hnode for vaddr 0x%llx\n",
1043                         vaddr);
1044                 return -EINVAL;
1045         }
1046
1047         hash_del(&hnode->node);
1048         mutex_unlock(&ctx->mem_hash_lock);
1049
1050         vm_type = hnode->ptr;
1051
1052         if (*vm_type == VM_TYPE_USERPTR) {
1053                 is_userptr = true;
1054                 userptr = hnode->ptr;
1055                 rc = init_phys_pg_pack_from_userptr(ctx, userptr,
1056                                                         &phys_pg_pack);
1057                 if (rc) {
1058                         dev_err(hdev->dev,
1059                                 "unable to init page pack for vaddr 0x%llx\n",
1060                                 vaddr);
1061                         goto vm_type_err;
1062                 }
1063
1064                 if (phys_pg_pack->page_size ==
1065                                         hdev->asic_prop.pmmu.page_size)
1066                         va_range = ctx->host_va_range;
1067                 else
1068                         va_range = ctx->host_huge_va_range;
1069         } else if (*vm_type == VM_TYPE_PHYS_PACK) {
1070                 is_userptr = false;
1071                 va_range = ctx->dram_va_range;
1072                 phys_pg_pack = hnode->ptr;
1073         } else {
1074                 dev_warn(hdev->dev,
1075                         "unmap failed, unknown vm desc for vaddr 0x%llx\n",
1076                                 vaddr);
1077                 rc = -EFAULT;
1078                 goto vm_type_err;
1079         }
1080
1081         if (atomic_read(&phys_pg_pack->mapping_cnt) == 0) {
1082                 dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
1083                 rc = -EINVAL;
1084                 goto mapping_cnt_err;
1085         }
1086
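        /*
         * map_device_va() returns the block VA plus the in-page offset; align
         * the address back down to the pack's page size before unmapping.
         */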
1087         vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
1088
1089         mutex_lock(&ctx->mmu_lock);
1090
1091         unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
1092
1093         /*
1094          * During context free this function is called in a loop to clean all
1095          * the context mappings. Hence the cache invalidation can be called once
1096          * at the loop end rather than for each iteration
1097          */
1098         if (!ctx_free)
1099                 rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
1100                                                                 *vm_type);
1101
1102         mutex_unlock(&ctx->mmu_lock);
1103
1104         /*
1105          * If the context is closing we don't need to check for the MMU cache
1106          * invalidation return code and update the VA free list as in this flow
1107          * we invalidate the MMU cache outside of this unmap function and the VA
1108          * free list will be freed anyway.
1109          */
1110         if (!ctx_free) {
1111                 int tmp_rc;
1112
1113                 if (rc)
1114                         dev_err(hdev->dev,
1115                                 "unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
1116                                 vaddr);
1117
1118                 tmp_rc = add_va_block(hdev, va_range, vaddr,
1119                                         vaddr + phys_pg_pack->total_size - 1);
1120                 if (tmp_rc) {
1121                         dev_warn(hdev->dev,
1122                                         "add va block failed for vaddr: 0x%llx\n",
1123                                         vaddr);
1124                         if (!rc)
1125                                 rc = tmp_rc;
1126                 }
1127         }
1128
1129         atomic_dec(&phys_pg_pack->mapping_cnt);
1130         kfree(hnode);
1131
1132         if (is_userptr) {
1133                 free_phys_pg_pack(hdev, phys_pg_pack);
1134                 dma_unmap_host_va(hdev, userptr);
1135         }
1136
1137         return rc;
1138
1139 mapping_cnt_err:
1140         if (is_userptr)
1141                 free_phys_pg_pack(hdev, phys_pg_pack);
1142 vm_type_err:
1143         mutex_lock(&ctx->mem_hash_lock);
1144         hash_add(ctx->mem_hash, &hnode->node, vaddr);
1145         mutex_unlock(&ctx->mem_hash_lock);
1146
1147         return rc;
1148 }
1149
1150 static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
1151 {
1152         struct hl_device *hdev = hpriv->hdev;
1153         struct hl_ctx *ctx = hpriv->ctx;
1154         u64 device_addr = 0;
1155         u32 handle = 0;
1156         int rc;
1157
1158         switch (args->in.op) {
1159         case HL_MEM_OP_ALLOC:
1160                 if (args->in.alloc.mem_size == 0) {
1161                         dev_err(hdev->dev,
1162                                 "alloc size must be larger than 0\n");
1163                         rc = -EINVAL;
1164                         goto out;
1165                 }
1166
1167                 /* Force contiguous as there are no real MMU
1168                  * translations to overcome physical memory gaps
1169                  */
1170                 args->in.flags |= HL_MEM_CONTIGUOUS;
1171                 rc = alloc_device_memory(ctx, &args->in, &handle);
1172
1173                 memset(args, 0, sizeof(*args));
1174                 args->out.handle = (__u64) handle;
1175                 break;
1176
1177         case HL_MEM_OP_FREE:
1178                 rc = free_device_memory(ctx, args->in.free.handle);
1179                 break;
1180
1181         case HL_MEM_OP_MAP:
1182                 if (args->in.flags & HL_MEM_USERPTR) {
1183                         device_addr = args->in.map_host.host_virt_addr;
1184                         rc = 0;
1185                 } else {
1186                         rc = get_paddr_from_handle(ctx, &args->in,
1187                                         &device_addr);
1188                 }
1189
1190                 memset(args, 0, sizeof(*args));
1191                 args->out.device_virt_addr = device_addr;
1192                 break;
1193
1194         case HL_MEM_OP_UNMAP:
1195                 rc = 0;
1196                 break;
1197
1198         default:
1199                 dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
1200                 rc = -ENOTTY;
1201                 break;
1202         }
1203
1204 out:
1205         return rc;
1206 }
1207
1208 int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
1209 {
1210         union hl_mem_args *args = data;
1211         struct hl_device *hdev = hpriv->hdev;
1212         struct hl_ctx *ctx = hpriv->ctx;
1213         u64 device_addr = 0;
1214         u32 handle = 0;
1215         int rc;
1216
1217         if (hl_device_disabled_or_in_reset(hdev)) {
1218                 dev_warn_ratelimited(hdev->dev,
1219                         "Device is %s. Can't execute MEMORY IOCTL\n",
1220                         atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
1221                 return -EBUSY;
1222         }
1223
1224         if (!hdev->mmu_enable)
1225                 return mem_ioctl_no_mmu(hpriv, args);
1226
1227         switch (args->in.op) {
1228         case HL_MEM_OP_ALLOC:
1229                 if (!hdev->dram_supports_virtual_memory) {
1230                         dev_err(hdev->dev, "DRAM alloc is not supported\n");
1231                         rc = -EINVAL;
1232                         goto out;
1233                 }
1234
1235                 if (args->in.alloc.mem_size == 0) {
1236                         dev_err(hdev->dev,
1237                                 "alloc size must be larger than 0\n");
1238                         rc = -EINVAL;
1239                         goto out;
1240                 }
1241                 rc = alloc_device_memory(ctx, &args->in, &handle);
1242
1243                 memset(args, 0, sizeof(*args));
1244                 args->out.handle = (__u64) handle;
1245                 break;
1246
1247         case HL_MEM_OP_FREE:
1248                 rc = free_device_memory(ctx, args->in.free.handle);
1249                 break;
1250
1251         case HL_MEM_OP_MAP:
1252                 rc = map_device_va(ctx, &args->in, &device_addr);
1253
1254                 memset(args, 0, sizeof(*args));
1255                 args->out.device_virt_addr = device_addr;
1256                 break;
1257
1258         case HL_MEM_OP_UNMAP:
1259                 rc = unmap_device_va(ctx, args->in.unmap.device_virt_addr,
1260                                         false);
1261                 break;
1262
1263         default:
1264                 dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
1265                 rc = -ENOTTY;
1266                 break;
1267         }
1268
1269 out:
1270         return rc;
1271 }
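
/*
 * Hypothetical user-space usage sketch (editor's addition, not part of this
 * file): allocate device memory and then map it through the memory IOCTL.
 * The ioctl request name HL_IOCTL_MEMORY and the /dev/hlX file descriptor fd
 * are assumed from the habanalabs uapi; the fields mirror those handled above.
 *
 *	union hl_mem_args args = {0};
 *
 *	args.in.op = HL_MEM_OP_ALLOC;
 *	args.in.alloc.mem_size = 2 * 1024 * 1024;
 *	if (ioctl(fd, HL_IOCTL_MEMORY, &args))
 *		return -1;
 *	handle = args.out.handle;
 *
 *	memset(&args, 0, sizeof(args));
 *	args.in.op = HL_MEM_OP_MAP;
 *	args.in.map_device.handle = handle;
 *	if (ioctl(fd, HL_IOCTL_MEMORY, &args))
 *		return -1;
 *	device_va = args.out.device_virt_addr;
 */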
1272
1273 static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
1274                                 u32 npages, u64 start, u32 offset,
1275                                 struct hl_userptr *userptr)
1276 {
1277         int rc;
1278
1279         if (!access_ok((void __user *) (uintptr_t) addr, size)) {
1280                 dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
1281                 return -EFAULT;
1282         }
1283
1284         userptr->vec = frame_vector_create(npages);
1285         if (!userptr->vec) {
1286                 dev_err(hdev->dev, "Failed to create frame vector\n");
1287                 return -ENOMEM;
1288         }
1289
1290         rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
1291                                 userptr->vec);
1292
1293         if (rc != npages) {
1294                 dev_err(hdev->dev,
1295                         "Failed to map host memory, user ptr probably wrong\n");
1296                 if (rc < 0)
1297                         goto destroy_framevec;
1298                 rc = -EFAULT;
1299                 goto put_framevec;
1300         }
1301
1302         if (frame_vector_to_pages(userptr->vec) < 0) {
1303                 dev_err(hdev->dev,
1304                         "Failed to translate frame vector to pages\n");
1305                 rc = -EFAULT;
1306                 goto put_framevec;
1307         }
1308
1309         rc = sg_alloc_table_from_pages(userptr->sgt,
1310                                         frame_vector_pages(userptr->vec),
1311                                         npages, offset, size, GFP_ATOMIC);
1312         if (rc < 0) {
1313                 dev_err(hdev->dev, "failed to create SG table from pages\n");
1314                 goto put_framevec;
1315         }
1316
1317         return 0;
1318
1319 put_framevec:
1320         put_vaddr_frames(userptr->vec);
1321 destroy_framevec:
1322         frame_vector_destroy(userptr->vec);
1323         return rc;
1324 }
1325
1326 /*
1327  * hl_pin_host_memory - pins a chunk of host memory.
1328  * @hdev: pointer to the habanalabs device structure
1329  * @addr: the host virtual address of the memory area
1330  * @size: the size of the memory area
1331  * @userptr: pointer to hl_userptr structure
1332  *
1333  * This function does the following:
1334  * - Pins the physical pages
1335  * - Create an SG list from those pages
1336  */
1337 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
1338                                         struct hl_userptr *userptr)
1339 {
1340         u64 start, end;
1341         u32 npages, offset;
1342         int rc;
1343
1344         if (!size) {
1345                 dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
1346                 return -EINVAL;
1347         }
1348
1349         /*
1350          * If the combination of the address and size requested for this memory
1351          * region causes an integer overflow, return error.
1352          */
1353         if (((addr + size) < addr) ||
1354                         PAGE_ALIGN(addr + size) < (addr + size)) {
1355                 dev_err(hdev->dev,
1356                         "user pointer 0x%llx + %llu causes integer overflow\n",
1357                         addr, size);
1358                 return -EINVAL;
1359         }
1360
1361         /*
1362          * This function can also be called from the data path, hence always
1363          * use GFP_ATOMIC, as it is not a big allocation anyway.
1364          */
1365         userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
1366         if (!userptr->sgt)
1367                 return -ENOMEM;
1368
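        /*
         * Split the user range into a page-aligned start address, the offset
         * within the first page and the number of pages to pin.
         */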
1369         start = addr & PAGE_MASK;
1370         offset = addr & ~PAGE_MASK;
1371         end = PAGE_ALIGN(addr + size);
1372         npages = (end - start) >> PAGE_SHIFT;
1373
1374         userptr->size = size;
1375         userptr->addr = addr;
1376         userptr->dma_mapped = false;
1377         INIT_LIST_HEAD(&userptr->job_node);
1378
1379         rc = get_user_memory(hdev, addr, size, npages, start, offset,
1380                                 userptr);
1381         if (rc) {
1382                 dev_err(hdev->dev,
1383                         "failed to get user memory for address 0x%llx\n",
1384                         addr);
1385                 goto free_sgt;
1386         }
1387
1388         hl_debugfs_add_userptr(hdev, userptr);
1389
1390         return 0;
1391
1392 free_sgt:
1393         kfree(userptr->sgt);
1394         return rc;
1395 }
1396
1397 /*
1398  * hl_unpin_host_memory - unpins a chunk of host memory.
1399  * @hdev: pointer to the habanalabs device structure
1400  * @userptr: pointer to hl_userptr structure
1401  *
1402  * This function does the following:
1403  * - Unpins the physical pages related to the host memory
1404  * - Free the SG list
1405  */
1406 void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
1407 {
1408         struct page **pages;
1409
1410         hl_debugfs_remove_userptr(hdev, userptr);
1411
1412         if (userptr->dma_mapped)
1413                 hdev->asic_funcs->hl_dma_unmap_sg(hdev, userptr->sgt->sgl,
1414                                                         userptr->sgt->nents,
1415                                                         userptr->dir);
1416
1417         pages = frame_vector_pages(userptr->vec);
1418         if (!IS_ERR(pages)) {
1419                 int i;
1420
1421                 for (i = 0; i < frame_vector_count(userptr->vec); i++)
1422                         set_page_dirty_lock(pages[i]);
1423         }
1424         put_vaddr_frames(userptr->vec);
1425         frame_vector_destroy(userptr->vec);
1426
1427         list_del(&userptr->job_node);
1428
1429         sg_free_table(userptr->sgt);
1430         kfree(userptr->sgt);
1431 }
1432
1433 /*
1434  * hl_userptr_delete_list - clear userptr list
1435  *
1436  * @hdev                : pointer to the habanalabs device structure
1437  * @userptr_list        : pointer to the list to clear
1438  *
1439  * This function does the following:
1440  * - Iterates over the list and unpins the host memory and frees the userptr
1441  *   structure.
1442  */
1443 void hl_userptr_delete_list(struct hl_device *hdev,
1444                                 struct list_head *userptr_list)
1445 {
1446         struct hl_userptr *userptr, *tmp;
1447
1448         list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
1449                 hl_unpin_host_memory(hdev, userptr);
1450                 kfree(userptr);
1451         }
1452
1453         INIT_LIST_HEAD(userptr_list);
1454 }
1455
1456 /*
1457  * hl_userptr_is_pinned - returns whether the given userptr is pinned
1458  *
1459  * @hdev                : pointer to the habanalabs device structure
1460  * @userptr_list        : pointer to the list to search in
1461  * @userptr             : pointer to userptr to check
1462  *
1463  * This function does the following:
1464  * - Iterates over the list and checks if a userptr with the given address and
1465  *   size is in it, meaning it is pinned. If so, returns true, otherwise false.
1466  */
1467 bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
1468                                 u32 size, struct list_head *userptr_list,
1469                                 struct hl_userptr **userptr)
1470 {
1471         list_for_each_entry((*userptr), userptr_list, job_node) {
1472                 if ((addr == (*userptr)->addr) && (size == (*userptr)->size))
1473                         return true;
1474         }
1475
1476         return false;
1477 }
1478
1479 /*
1480  * va_range_init - initialize virtual addresses range
1481  * @hdev: pointer to the habanalabs device structure
1482  * @va_range: pointer to the range to initialize
1483  * @start: range start address
1484  * @end: range end address
1485  *
1486  * This function does the following:
1487  * - Initializes the virtual addresses list of the given range with the given
1488  *   addresses.
1489  */
1490 static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
1491                                 u64 start, u64 end)
1492 {
1493         int rc;
1494
1495         INIT_LIST_HEAD(&va_range->list);
1496
1497         /* Align the range inward to PAGE_SIZE boundaries */
1498
1499         if (start & (PAGE_SIZE - 1)) {
1500                 start &= PAGE_MASK;
1501                 start += PAGE_SIZE;
1502         }
1503
1504         if (end & (PAGE_SIZE - 1))
1505                 end &= PAGE_MASK;
1506
1507         if (start >= end) {
1508                 dev_err(hdev->dev, "too small vm range for va list\n");
1509                 return -EFAULT;
1510         }
1511
1512         rc = add_va_block(hdev, va_range, start, end);
1513
1514         if (rc) {
1515                 dev_err(hdev->dev, "Failed to init va list\n");
1516                 return rc;
1517         }
1518
1519         va_range->start_addr = start;
1520         va_range->end_addr = end;
1521
1522         return 0;
1523 }
1524
1525 /*
1526  * va_range_fini() - clear a virtual addresses range
1527  * @hdev: pointer to the habanalabs device structure
1528  * @va_range: pointer to virtual addresses range
1529  *
1530  * This function does the following:
1531  * - Frees the virtual addresses block list and its lock
1532  */
1533 static void va_range_fini(struct hl_device *hdev,
1534                 struct hl_va_range *va_range)
1535 {
1536         mutex_lock(&va_range->lock);
1537         clear_va_list_locked(hdev, &va_range->list);
1538         mutex_unlock(&va_range->lock);
1539
1540         mutex_destroy(&va_range->lock);
1541         kfree(va_range);
1542 }
1543
1544 /*
1545  * vm_ctx_init_with_ranges() - initialize virtual memory for context
1546  * @ctx: pointer to the habanalabs context structure
1547  * @host_range_start: host virtual addresses range start.
1548  * @host_range_end: host virtual addresses range end.
1549  * @host_huge_range_start: host virtual addresses range start for memory
1550  *                          allocated with huge pages.
1551  * @host_huge_range_end: host virtual addresses range end for memory allocated
1552  *                        with huge pages.
1553  * @dram_range_start: dram virtual addresses range start.
1554  * @dram_range_end: dram virtual addresses range end.
1555  *
1556  * This function initializes the following:
1557  * - MMU for context
1558  * - Virtual address to area descriptor hashtable
1559  * - Virtual block list of available virtual memory
1560  */
1561 static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
1562                                         u64 host_range_start,
1563                                         u64 host_range_end,
1564                                         u64 host_huge_range_start,
1565                                         u64 host_huge_range_end,
1566                                         u64 dram_range_start,
1567                                         u64 dram_range_end)
1568 {
1569         struct hl_device *hdev = ctx->hdev;
1570         int rc;
1571
1572         ctx->host_va_range = kzalloc(sizeof(*ctx->host_va_range), GFP_KERNEL);
1573         if (!ctx->host_va_range)
1574                 return -ENOMEM;
1575
1576         ctx->host_huge_va_range = kzalloc(sizeof(*ctx->host_huge_va_range),
1577                                                 GFP_KERNEL);
1578         if (!ctx->host_huge_va_range) {
1579                 rc = -ENOMEM;
1580                 goto host_huge_va_range_err;
1581         }
1582
1583         ctx->dram_va_range = kzalloc(sizeof(*ctx->dram_va_range), GFP_KERNEL);
1584         if (!ctx->dram_va_range) {
1585                 rc = -ENOMEM;
1586                 goto dram_va_range_err;
1587         }
1588
1589         rc = hl_mmu_ctx_init(ctx);
1590         if (rc) {
1591                 dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
1592                 goto mmu_ctx_err;
1593         }
1594
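        /*
         * mem_hash maps a device virtual address to the descriptor of the
         * mapping behind it (host userptr or DRAM physical pages pack) and is
         * protected by mem_hash_lock.
         */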
1595         mutex_init(&ctx->mem_hash_lock);
1596         hash_init(ctx->mem_hash);
1597
1598         mutex_init(&ctx->host_va_range->lock);
1599
1600         rc = va_range_init(hdev, ctx->host_va_range, host_range_start,
1601                                 host_range_end);
1602         if (rc) {
1603                 dev_err(hdev->dev, "failed to init host vm range\n");
1604                 goto host_page_range_err;
1605         }
1606
1607         if (hdev->pmmu_huge_range) {
1608                 mutex_init(&ctx->host_huge_va_range->lock);
1609
1610                 rc = va_range_init(hdev, ctx->host_huge_va_range,
1611                                         host_huge_range_start,
1612                                         host_huge_range_end);
1613                 if (rc) {
1614                         dev_err(hdev->dev,
1615                                 "failed to init host huge vm range\n");
1616                         goto host_hpage_range_err;
1617                 }
1618         } else {
                /*
                 * No dedicated huge-page host range on this ASIC - free the
                 * unused allocation and alias the huge range to the regular
                 * host range.
                 */
                kfree(ctx->host_huge_va_range);
1619                 ctx->host_huge_va_range = ctx->host_va_range;
1620         }
1621
1622         mutex_init(&ctx->dram_va_range->lock);
1623
1624         rc = va_range_init(hdev, ctx->dram_va_range, dram_range_start,
1625                         dram_range_end);
1626         if (rc) {
1627                 dev_err(hdev->dev, "failed to init dram vm range\n");
1628                 goto dram_vm_err;
1629         }
1630
1631         hl_debugfs_add_ctx_mem_hash(hdev, ctx);
1632
1633         return 0;
1634
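        /*
         * Error path: undo the initialization steps above in reverse order.
         * The huge-page range is torn down separately only when it is a
         * distinct range (hdev->pmmu_huge_range).
         */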
1635 dram_vm_err:
1636         mutex_destroy(&ctx->dram_va_range->lock);
1637
1638         if (hdev->pmmu_huge_range) {
1639                 mutex_lock(&ctx->host_huge_va_range->lock);
1640                 clear_va_list_locked(hdev, &ctx->host_huge_va_range->list);
1641                 mutex_unlock(&ctx->host_huge_va_range->lock);
1642         }
1643 host_hpage_range_err:
1644         if (hdev->pmmu_huge_range)
1645                 mutex_destroy(&ctx->host_huge_va_range->lock);
1646         mutex_lock(&ctx->host_va_range->lock);
1647         clear_va_list_locked(hdev, &ctx->host_va_range->list);
1648         mutex_unlock(&ctx->host_va_range->lock);
1649 host_page_range_err:
1650         mutex_destroy(&ctx->host_va_range->lock);
1651         mutex_destroy(&ctx->mem_hash_lock);
1652         hl_mmu_ctx_fini(ctx);
1653 mmu_ctx_err:
1654         kfree(ctx->dram_va_range);
1655 dram_va_range_err:
1656         kfree(ctx->host_huge_va_range);
1657 host_huge_va_range_err:
1658         kfree(ctx->host_va_range);
1659
1660         return rc;
1661 }
1662
1663 int hl_vm_ctx_init(struct hl_ctx *ctx)
1664 {
1665         struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
1666         u64 host_range_start, host_range_end, host_huge_range_start,
1667                 host_huge_range_end, dram_range_start, dram_range_end;
1668
1669         atomic64_set(&ctx->dram_phys_mem, 0);
1670
1671         /*
1672          * - If MMU is enabled, init the ranges as usual.
1673          * - If MMU is disabled, in case of host mapping, the returned address
1674          *   is the given one.
1675          *   In case of DRAM mapping, the returned address is the physical
1676          *   address of the memory related to the given handle.
1677          */
1678         if (ctx->hdev->mmu_enable) {
1679                 dram_range_start = prop->dmmu.start_addr;
1680                 dram_range_end = prop->dmmu.end_addr;
1681                 host_range_start = prop->pmmu.start_addr;
1682                 host_range_end = prop->pmmu.end_addr;
1683                 host_huge_range_start = prop->pmmu_huge.start_addr;
1684                 host_huge_range_end = prop->pmmu_huge.end_addr;
1685         } else {
1686                 dram_range_start = prop->dram_user_base_address;
1687                 dram_range_end = prop->dram_end_address;
1688                 host_range_start = prop->dram_user_base_address;
1689                 host_range_end = prop->dram_end_address;
1690                 host_huge_range_start = prop->dram_user_base_address;
1691                 host_huge_range_end = prop->dram_end_address;
1692         }
1693
1694         return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
1695                                         host_huge_range_start,
1696                                         host_huge_range_end,
1697                                         dram_range_start,
1698                                         dram_range_end);
1699 }
1700
1701 /*
1702  * hl_vm_ctx_fini       - virtual memory teardown of context
1703  *
1704  * @ctx                 : pointer to the habanalabs context structure
1705  *
1706  * This function performs teardown of the following:
1707  * - Virtual block list of available virtual memory
1708  * - Virtual address to area descriptor hashtable
1709  * - MMU for context
1710  *
1711  * In addition this function does the following:
1712  * - Unmaps the existing hashtable nodes if the hashtable is not empty. The
1713  *   hashtable should be empty as no valid mappings should exist at this
1714  *   point.
1715  * - Frees any existing physical page list from the idr which relates to the
1716  *   current context asid.
1717  * - This function checks the virtual block list for correctness. At this point
1718  *   the list should contain one element which describes the whole virtual
1719  *   memory range of the context. Otherwise, a warning is printed.
1720  */
1721 void hl_vm_ctx_fini(struct hl_ctx *ctx)
1722 {
1723         struct hl_device *hdev = ctx->hdev;
1724         struct hl_vm *vm = &hdev->vm;
1725         struct hl_vm_phys_pg_pack *phys_pg_list;
1726         struct hl_vm_hash_node *hnode;
1727         struct hlist_node *tmp_node;
1728         int i;
1729
1730         hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
1731
1732         /*
1733          * If a hard reset is pending, something clearly went wrong already, so
1734          * there is no point in printing another, side-effect error
1735          */
1736         if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
1737                 dev_notice(hdev->dev,
1738                         "user released device without removing its memory mappings\n");
1739
1740         hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
1741                 dev_dbg(hdev->dev,
1742                         "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
1743                         hnode->vaddr, ctx->asid);
1744                 unmap_device_va(ctx, hnode->vaddr, true);
1745         }
1746
1747         /* invalidate the cache once after the unmapping loop */
1748         hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
1749         hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_PHYS_PACK);
1750
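        /*
         * Release any device memory allocations of this context that the user
         * did not free explicitly, and return their size to the DRAM usage
         * accounting.
         */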
1751         spin_lock(&vm->idr_lock);
1752         idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
1753                 if (phys_pg_list->asid == ctx->asid) {
1754                         dev_dbg(hdev->dev,
1755                                 "page list 0x%px of asid %d is still alive\n",
1756                                 phys_pg_list, ctx->asid);
1757                         atomic64_sub(phys_pg_list->total_size,
1758                                         &hdev->dram_used_mem);
1759                         free_phys_pg_pack(hdev, phys_pg_list);
1760                         idr_remove(&vm->phys_pg_pack_handles, i);
1761                 }
1762         spin_unlock(&vm->idr_lock);
1763
1764         va_range_fini(hdev, ctx->dram_va_range);
1765         if (hdev->pmmu_huge_range)
1766                 va_range_fini(hdev, ctx->host_huge_va_range);
1767         va_range_fini(hdev, ctx->host_va_range);
1768
1769         mutex_destroy(&ctx->mem_hash_lock);
1770         hl_mmu_ctx_fini(ctx);
1771 }
1772
1773 /*
1774  * hl_vm_init           - initialize virtual memory module
1775  *
1776  * @hdev                : pointer to the habanalabs device structure
1777  *
1778  * This function initializes the following:
1779  * - MMU module
1780  * - DRAM physical pages pool of 2MB
1781  * - Idr for device memory allocation handles
1782  */
1783 int hl_vm_init(struct hl_device *hdev)
1784 {
1785         struct asic_fixed_properties *prop = &hdev->asic_prop;
1786         struct hl_vm *vm = &hdev->vm;
1787         int rc;
1788
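        /*
         * The DRAM pool hands out device memory in dram_page_size granules;
         * gen_pool_create() takes the allocation order, hence __ffs() of the
         * (power-of-2) page size.
         */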
1789         vm->dram_pg_pool = gen_pool_create(__ffs(prop->dram_page_size), -1);
1790         if (!vm->dram_pg_pool) {
1791                 dev_err(hdev->dev, "Failed to create dram page pool\n");
1792                 return -ENOMEM;
1793         }
1794
1795         kref_init(&vm->dram_pg_pool_refcount);
1796
1797         rc = gen_pool_add(vm->dram_pg_pool, prop->dram_user_base_address,
1798                         prop->dram_end_address - prop->dram_user_base_address,
1799                         -1);
1800
1801         if (rc) {
1802                 dev_err(hdev->dev,
1803                         "Failed to add memory to dram page pool %d\n", rc);
1804                 goto pool_add_err;
1805         }
1806
1807         spin_lock_init(&vm->idr_lock);
1808         idr_init(&vm->phys_pg_pack_handles);
1809
1810         atomic64_set(&hdev->dram_used_mem, 0);
1811
1812         vm->init_done = true;
1813
1814         return 0;
1815
1816 pool_add_err:
1817         gen_pool_destroy(vm->dram_pg_pool);
1818
1819         return rc;
1820 }
1821
1822 /*
1823  * hl_vm_fini           - virtual memory module teardown
1824  *
1825  * @hdev                : pointer to the habanalabs device structure
1826  *
1827  * This function performs teardown of the following:
1828  * - Idr for device memory allocation handles
1829  * - DRAM physical pages pool of 2MB
1830  * - MMU module
1831  */
1832 void hl_vm_fini(struct hl_device *hdev)
1833 {
1834         struct hl_vm *vm = &hdev->vm;
1835
1836         if (!vm->init_done)
1837                 return;
1838
1839         /*
1840          * At this point all the contexts should be freed and hence no DRAM
1841          * memory should be in use, so the DRAM pool should be freed here.
1842          */
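        /*
         * kref_put() returns 1 only if this was the last reference and the
         * release callback destroyed the pool; otherwise something is still
         * holding a reference and the pool is leaked.
         */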
1843         if (kref_put(&vm->dram_pg_pool_refcount, dram_pg_pool_do_release) != 1)
1844                 dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
1845                                 __func__);
1846
1847         vm->init_done = false;
1848 }