* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
* Copyright (c) 2020, Intel Corporation. All rights reserved.
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
MAX_PENDING_REG_MR = 8,
#define MLX5_UMR_ALIGN 2048
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
unsigned int page_size, bool populate);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
struct mlx5_ib_dev *dev = to_mdev(pd->device);
MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
MLX5_SET(mkc, mkc, lr, 1);
if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET64(mkc, mkc, start_addr, start_addr);
static void assign_mkey_variant(struct mlx5_ib_dev *dev,
struct mlx5_ib_mkey *mkey, u32 *in)
u8 key = atomic_inc_return(&dev->mkey_var);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, mkey_7_0, key);
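/*
 * Illustrative sketch of how the final mkey value is composed (assuming
 * mlx5_idx_to_mkey() shifts the firmware-assigned mkey index left by 8
 * bits, as used in create_mkey_callback() below): the 8-bit variant set
 * here occupies mkey bits 7:0 and the 24-bit index occupies bits 31:8,
 * so an index of 0x0012AB combined with a variant of 0x5C would yield
 * an lkey/rkey of 0x0012AB5C. Rotating the variant on every allocation
 * helps catch use of a stale mkey after its index has been reused.
 */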
static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
assign_mkey_variant(dev, mkey, in);
ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
init_waitqueue_head(&mkey->wait);
mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
struct mlx5_ib_mkey *mkey,
struct mlx5_async_ctx *async_ctx,
u32 *in, int inlen, u32 *out, int outlen,
struct mlx5_async_work *context)
MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
assign_mkey_variant(dev, mkey, in);
return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
create_mkey_callback, context);
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
if (status == -ENXIO) /* core driver is not available */
mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
if (status != -EREMOTEIO) /* driver specific failure */
/* Failed in FW, print cmd out failure details */
mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
static void create_mkey_callback(int status, struct mlx5_async_work *context)
struct mlx5_ib_mr *mr =
container_of(context, struct mlx5_ib_mr, cb_work);
struct mlx5_cache_ent *ent = mr->cache_ent;
struct mlx5_ib_dev *dev = ent->dev;
create_mkey_warn(dev, status, mr->out);
spin_lock_irqsave(&ent->lock, flags);
WRITE_ONCE(dev->fill_delay, 1);
spin_unlock_irqrestore(&ent->lock, flags);
mod_timer(&dev->delay_timer, jiffies + HZ);
mr->mmkey.type = MLX5_MKEY_MR;
mr->mmkey.key |= mlx5_idx_to_mkey(
MLX5_GET(create_mkey_out, mr->out, mkey_index));
init_waitqueue_head(&mr->mmkey.wait);
WRITE_ONCE(dev->cache.last_add, jiffies);
spin_lock_irqsave(&ent->lock, flags);
list_add_tail(&mr->list, &ent->head);
ent->available_mrs++;
/* If we are doing fill_to_high_water then keep going. */
queue_adjust_cache_locked(ent);
spin_unlock_irqrestore(&ent->lock, flags);
static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
switch (access_mode) {
case MLX5_MKC_ACCESS_MODE_MTT:
ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
sizeof(struct mlx5_mtt));
case MLX5_MKC_ACCESS_MODE_KSM:
ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
sizeof(struct mlx5_klm));
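/*
 * Worked example for the conversion above (a sketch assuming
 * MLX5_IB_UMR_OCTOWORD is 16 bytes, struct mlx5_mtt is 8 bytes and
 * struct mlx5_klm is 16 bytes): an MTT octoword holds two translation
 * entries, so 256 MTT descriptors need DIV_ROUND_UP(256, 2) = 128
 * octowords, while 256 KSM descriptors need DIV_ROUND_UP(256, 1) = 256
 * octowords.
 */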
static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
struct mlx5_ib_mr *mr;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
MLX5_SET(mkc, mkc, free, 1);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
MLX5_SET(mkc, mkc, translations_octword_size,
get_mkc_octo_size(ent->access_mode, ent->ndescs));
MLX5_SET(mkc, mkc, log_page_size, ent->page);
/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mr *mr;
in = kzalloc(inlen, GFP_KERNEL);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
for (i = 0; i < num; i++) {
mr = alloc_cache_mr(ent, mkc);
spin_lock_irq(&ent->lock);
if (ent->pending >= MAX_PENDING_REG_MR) {
spin_unlock_irq(&ent->lock);
spin_unlock_irq(&ent->lock);
err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
&ent->dev->async_ctx, in, inlen,
mr->out, sizeof(mr->out),
spin_lock_irq(&ent->lock);
spin_unlock_irq(&ent->lock);
mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
/* Synchronously create an MR in the cache */
static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mr *mr;
in = kzalloc(inlen, GFP_KERNEL);
return ERR_PTR(-ENOMEM);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
mr = alloc_cache_mr(ent, mkc);
err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
init_waitqueue_head(&mr->mmkey.wait);
mr->mmkey.type = MLX5_MKEY_MR;
WRITE_ONCE(ent->dev->cache.last_add, jiffies);
spin_lock_irq(&ent->lock);
spin_unlock_irq(&ent->lock);
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
struct mlx5_ib_mr *mr;
lockdep_assert_held(&ent->lock);
if (list_empty(&ent->head))
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
ent->available_mrs--;
spin_unlock_irq(&ent->lock);
mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
spin_lock_irq(&ent->lock);
static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
lockdep_assert_held(&ent->lock);
target = ent->limit * 2;
if (target == ent->available_mrs + ent->pending)
if (target > ent->available_mrs + ent->pending) {
u32 todo = target - (ent->available_mrs + ent->pending);
spin_unlock_irq(&ent->lock);
err = add_keys(ent, todo);
usleep_range(3000, 5000);
spin_lock_irq(&ent->lock);
remove_cache_mr_locked(ent);
static ssize_t size_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
struct mlx5_cache_ent *ent = filp->private_data;
err = kstrtou32_from_user(buf, count, 0, &target);
* Target is the new value of total_mrs the user requests; however, we
* cannot free MRs that are in use. Compute the target value for
spin_lock_irq(&ent->lock);
if (target < ent->total_mrs - ent->available_mrs) {
target = target - (ent->total_mrs - ent->available_mrs);
if (target < ent->limit || target > ent->limit*2) {
err = resize_available_mrs(ent, target, false);
spin_unlock_irq(&ent->lock);
spin_unlock_irq(&ent->lock);
static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
struct mlx5_cache_ent *ent = filp->private_data;
err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
return simple_read_from_buffer(buf, count, pos, lbuf, err);
static const struct file_operations size_fops = {
.owner = THIS_MODULE,
static ssize_t limit_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
struct mlx5_cache_ent *ent = filp->private_data;
err = kstrtou32_from_user(buf, count, 0, &var);
* Upon set we immediately fill the cache to the high water mark implied by
spin_lock_irq(&ent->lock);
err = resize_available_mrs(ent, 0, true);
spin_unlock_irq(&ent->lock);
static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
struct mlx5_cache_ent *ent = filp->private_data;
err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
return simple_read_from_buffer(buf, count, pos, lbuf, err);
static const struct file_operations limit_fops = {
.owner = THIS_MODULE,
.write = limit_write,
static bool someone_adding(struct mlx5_mr_cache *cache)
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
struct mlx5_cache_ent *ent = &cache->ent[i];
spin_lock_irq(&ent->lock);
ret = ent->available_mrs < ent->limit;
spin_unlock_irq(&ent->lock);
* Check if the bucket is outside the high/low water mark and schedule an async
* update. The cache refill has hysteresis: once the low water mark is hit it is
* refilled up to the high mark.
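*
* Illustrative numbers (a sketch, not taken from a real profile): with
* ent->limit = 500, a drop below 500 available MRs sets
* fill_to_high_water and queues immediate refill work; refill keeps
* going while available_mrs + pending stays below 2 * limit = 1000.
* Once the count exceeds 1000, shrinking is queued with a one-second
* delay instead, so the entry oscillates between the two marks rather
* than thrashing around a single threshold.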
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
lockdep_assert_held(&ent->lock);
if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
if (ent->available_mrs < ent->limit) {
ent->fill_to_high_water = true;
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} else if (ent->fill_to_high_water &&
ent->available_mrs + ent->pending < 2 * ent->limit) {
* Once we start populating due to hitting a low water mark
* continue until we pass the high water mark.
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
} else if (ent->available_mrs == 2 * ent->limit) {
ent->fill_to_high_water = false;
} else if (ent->available_mrs > 2 * ent->limit) {
/* Queue deletion of excess entries */
ent->fill_to_high_water = false;
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
msecs_to_jiffies(1000));
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
static void __cache_work_func(struct mlx5_cache_ent *ent)
struct mlx5_ib_dev *dev = ent->dev;
struct mlx5_mr_cache *cache = &dev->cache;
spin_lock_irq(&ent->lock);
if (ent->fill_to_high_water &&
ent->available_mrs + ent->pending < 2 * ent->limit &&
!READ_ONCE(dev->fill_delay)) {
spin_unlock_irq(&ent->lock);
err = add_keys(ent, 1);
spin_lock_irq(&ent->lock);
* EAGAIN only happens if pending is positive, so we
* will be rescheduled from create_mkey_callback(). The only
* failure path here is ENOMEM.
if (err != -EAGAIN) {
"command failed order %d, err %d\n",
queue_delayed_work(cache->wq, &ent->dwork,
msecs_to_jiffies(1000));
} else if (ent->available_mrs > 2 * ent->limit) {
* The remove_cache_mr() logic is performed as a garbage
* collection task. Such a task is intended to run when no
* other active processes are running.
* The need_resched() will return TRUE if there are user tasks
* to be activated in the near future.
* In that case, we don't execute remove_cache_mr() and postpone
* the garbage collection work to the next cycle, in
* order to free CPU resources to other tasks.
spin_unlock_irq(&ent->lock);
need_delay = need_resched() || someone_adding(cache) ||
READ_ONCE(cache->last_add) + 300 * HZ);
spin_lock_irq(&ent->lock);
queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
remove_cache_mr_locked(ent);
queue_adjust_cache_locked(ent);
spin_unlock_irq(&ent->lock);
static void delayed_cache_work_func(struct work_struct *work)
struct mlx5_cache_ent *ent;
ent = container_of(work, struct mlx5_cache_ent, dwork.work);
__cache_work_func(ent);
struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
struct mlx5_cache_ent *ent,
struct mlx5_ib_mr *mr;
/* Matches access in alloc_cache_mr() */
if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
return ERR_PTR(-EOPNOTSUPP);
spin_lock_irq(&ent->lock);
if (list_empty(&ent->head)) {
queue_adjust_cache_locked(ent);
spin_unlock_irq(&ent->lock);
mr = create_cache_mr(ent);
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
ent->available_mrs--;
queue_adjust_cache_locked(ent);
spin_unlock_irq(&ent->lock);
static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
struct mlx5_cache_ent *ent = mr->cache_ent;
spin_lock_irq(&ent->lock);
list_add_tail(&mr->list, &ent->head);
ent->available_mrs++;
queue_adjust_cache_locked(ent);
spin_unlock_irq(&ent->lock);
static void clean_keys(struct mlx5_ib_dev *dev, int c)
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent = &cache->ent[c];
struct mlx5_ib_mr *tmp_mr;
struct mlx5_ib_mr *mr;
cancel_delayed_work(&ent->dwork);
spin_lock_irq(&ent->lock);
if (list_empty(&ent->head)) {
spin_unlock_irq(&ent->lock);
mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
list_move(&mr->list, &del_list);
ent->available_mrs--;
spin_unlock_irq(&ent->lock);
mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
if (!mlx5_debugfs_root || dev->is_rep)
debugfs_remove_recursive(dev->cache.root);
dev->cache.root = NULL;
static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
if (!mlx5_debugfs_root || dev->is_rep)
cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
sprintf(ent->name, "%d", ent->order);
dir = debugfs_create_dir(ent->name, cache->root);
debugfs_create_file("size", 0600, dir, ent, &size_fops);
debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
debugfs_create_u32("miss", 0600, dir, &ent->miss);
static void delay_time_func(struct timer_list *t)
struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
WRITE_ONCE(dev->fill_delay, 0);
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
mutex_init(&dev->slow_path_mutex);
cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
mlx5_ib_warn(dev, "failed to create work queue\n");
mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
timer_setup(&dev->delay_timer, delay_time_func, 0);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
ent = &cache->ent[i];
INIT_LIST_HEAD(&ent->head);
spin_lock_init(&ent->lock);
INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
if (i > MR_CACHE_LAST_STD_ENTRY) {
mlx5_odp_init_mr_cache_entry(ent);
if (ent->order > mr_cache_max_order(dev))
ent->page = PAGE_SHIFT;
ent->ndescs = 1 << ent->order;
ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
!dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
mlx5r_umr_can_load_pas(dev, 0))
ent->limit = dev->mdev->profile.mr_cache[i].limit;
spin_lock_irq(&ent->lock);
queue_adjust_cache_locked(ent);
spin_unlock_irq(&ent->lock);
mlx5_mr_cache_debugfs_init(dev);
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
struct mlx5_cache_ent *ent = &dev->cache.ent[i];
spin_lock_irq(&ent->lock);
ent->disabled = true;
spin_unlock_irq(&ent->lock);
cancel_delayed_work_sync(&ent->dwork);
mlx5_mr_cache_debugfs_cleanup(dev);
mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
destroy_workqueue(dev->cache.wq);
del_timer_sync(&dev->delay_timer);
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mr *mr;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
return ERR_PTR(-ENOMEM);
in = kzalloc(inlen, GFP_KERNEL);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
MLX5_SET(mkc, mkc, length64, 1);
set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
mr->mmkey.type = MLX5_MKEY_MR;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
static int get_octo_len(u64 addr, u64 len, int page_shift)
u64 page_size = 1ULL << page_shift;
offset = addr & (page_size - 1);
npages = ALIGN(len + offset, page_size) >> page_shift;
return (npages + 1) / 2;
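/*
 * Worked example for get_octo_len() (a sketch assuming each 16-byte
 * octoword holds two 8-byte MTT entries, hence the divide by two):
 * with addr = 0x1234, len = 0x10000 and page_shift = 12, the in-page
 * offset is 0x234, npages = ALIGN(0x10234, 0x1000) >> 12 = 17, and the
 * mkey needs (17 + 1) / 2 = 9 translation octowords.
 */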
static int mr_cache_max_order(struct mlx5_ib_dev *dev)
if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
return MR_CACHE_LAST_STD_ENTRY + 2;
return MLX5_MAX_UMR_SHIFT;
static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
struct mlx5_ib_umr_context *context =
container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
context->status = wc->status;
complete(&context->done);
static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
context->cqe.done = mlx5_ib_umr_done;
context->status = -1;
init_completion(&context->done);
static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
struct mlx5_umr_wr *umrwr)
struct umr_common *umrc = &dev->umrc;
const struct ib_send_wr *bad;
struct mlx5_ib_umr_context umr_context;
mlx5_ib_init_umr_context(&umr_context);
umrwr->wr.wr_cqe = &umr_context.cqe;
err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
wait_for_completion(&umr_context.done);
if (umr_context.status != IB_WC_SUCCESS) {
mlx5_ib_warn(dev, "reg umr failed (%u)\n",
static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
struct mlx5_mr_cache *cache = &dev->cache;
if (order < cache->ent[0].order)
return &cache->ent[0];
order = order - cache->ent[0].order;
if (order > MR_CACHE_LAST_STD_ENTRY)
return &cache->ent[order];
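/*
 * Sketch of the order-to-entry mapping above (assuming the first cache
 * entry is sized for order 2, i.e. four descriptors): a request for
 * order 2 or less maps to ent[0] and order 5 maps to ent[3], while
 * anything above MR_CACHE_LAST_STD_ENTRY is not served from the cache,
 * so callers such as alloc_cacheable_mr() take the non-cached
 * registration path instead.
 */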
static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
u64 length, int access_flags, u64 iova)
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
mr->ibmr.length = length;
mr->ibmr.device = &dev->ib_dev;
mr->ibmr.iova = iova;
mr->access_flags = access_flags;
static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
* The alignment of iova has already been checked upon entering
* UVERBS_METHOD_REG_DMABUF_MR
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
struct ib_umem *umem, u64 iova,
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_cache_ent *ent;
struct mlx5_ib_mr *mr;
unsigned int page_size;
page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
if (WARN_ON(!page_size))
return ERR_PTR(-EINVAL);
ent = mr_cache_ent_from_order(
dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
* Matches access in alloc_cache_mr(). If the MR can't come from the
* cache then synchronously create an uncached one.
if (!ent || ent->limit == 0 ||
!mlx5r_umr_can_reconfig(dev, 0, access_flags)) {
mutex_lock(&dev->slow_path_mutex);
mr = reg_create(pd, umem, iova, access_flags, page_size, false);
mutex_unlock(&dev->slow_path_mutex);
mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
mr->page_shift = order_base_2(page_size);
set_mr_fields(dev, mr, umem->length, access_flags, iova);
* Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
struct mlx5_umr_wr *wr, struct ib_sge *sg,
size_t nents, size_t ent_size,
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
xlt = mlx5r_umr_create_xlt(dev, sg, nents, ent_size, flags);
memset(wr, 0, sizeof(*wr));
wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
wr->wr.opcode = MLX5_IB_WR_UMR;
wr->pd = mr->ibmr.pd;
wr->mkey = mr->mmkey.key;
wr->length = mr->ibmr.length;
wr->virt_addr = mr->ibmr.iova;
wr->access_flags = mr->access_flags;
wr->page_shift = mr->page_shift;
wr->xlt_size = sg->length;
static unsigned int xlt_wr_final_send_flags(unsigned int flags)
unsigned int res = 0;
if (flags & MLX5_IB_UPD_XLT_ENABLE)
res |= MLX5_IB_SEND_UMR_ENABLE_MR |
MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
if (flags & MLX5_IB_UPD_XLT_ADDR)
res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags)
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct mlx5_umr_wr wr;
int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
? sizeof(struct mlx5_klm)
: sizeof(struct mlx5_mtt);
const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
const int page_mask = page_align - 1;
size_t pages_mapped = 0;
size_t pages_to_map = 0;
size_t size_to_map = 0;
size_t orig_sg_length;
if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
!umr_can_use_indirect_mkey(dev))
if (WARN_ON(!mr->umem->is_odp))
/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
* so we need to align the offset and length accordingly
if (idx & page_mask) {
npages += idx & page_mask;
pages_to_map = ALIGN(npages, page_align);
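/*
 * Worked example of the alignment above (a sketch assuming
 * MLX5_UMR_MTT_ALIGNMENT is 64 bytes and an 8-byte MTT descriptor, so
 * page_align = 8 and page_mask = 7): updating 3 entries starting at
 * idx = 10 aligns the start index down to 8, grows npages to 5 (3 plus
 * the 2 skipped entries), and then pages_to_map = ALIGN(5, 8) = 8, so
 * entries 8..15 are rewritten and the UMR transfer stays a multiple of
 * the alignment unit.
 */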
xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
pages_iter = sg.length / desc_size;
orig_sg_length = sg.length;
if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
pages_to_map = min_t(size_t, pages_to_map, max_pages);
wr.page_shift = page_shift;
for (pages_mapped = 0;
pages_mapped < pages_to_map && !err;
pages_mapped += pages_iter, idx += pages_iter) {
npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
dma_sync_single_for_device(ddev, sg.addr, sg.length,
sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
if (pages_mapped + pages_iter >= pages_to_map)
wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
wr.offset = idx * desc_size;
wr.xlt_size = sg.length;
err = mlx5_ib_post_send_wait(dev, &wr);
sg.length = orig_sg_length;
mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
* Send the DMA list to the HW for a normal MR using UMR.
* Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
struct mlx5_ib_dev *dev = mr_to_mdev(mr);
struct device *ddev = &dev->mdev->pdev->dev;
struct ib_block_iter biter;
struct mlx5_mtt *cur_mtt;
struct mlx5_umr_wr wr;
size_t orig_sg_length;
struct mlx5_mtt *mtt;
if (WARN_ON(mr->umem->is_odp))
mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
ib_umem_num_dma_blocks(mr->umem,
1 << mr->page_shift),
sizeof(*mtt), flags);
orig_sg_length = sg.length;
rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
mr->umem->sgt_append.sgt.nents,
BIT(mr->page_shift)) {
if (cur_mtt == (void *)mtt + sg.length) {
dma_sync_single_for_device(ddev, sg.addr, sg.length,
err = mlx5_ib_post_send_wait(dev, &wr);
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
wr.offset += sg.length;
cpu_to_be64(rdma_block_iter_dma_address(&biter) |
MLX5_IB_MTT_PRESENT);
if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
final_size = (void *)cur_mtt - (void *)mtt;
sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
memset(cur_mtt, 0, sg.length - final_size);
wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
wr.xlt_size = sg.length;
dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
err = mlx5_ib_post_send_wait(dev, &wr);
sg.length = orig_sg_length;
mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
* reg_create allocates a new mlx5_ib_mr and creates its mkey directly
* with the command interface, bypassing the MR cache.
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags,
unsigned int page_size, bool populate)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr;
bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
return ERR_PTR(-EINVAL);
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
return ERR_PTR(-ENOMEM);
mr->access_flags = access_flags;
mr->page_shift = order_base_2(page_size);
inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
inlen += sizeof(*pas) *
roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
in = kvzalloc(inlen, GFP_KERNEL);
pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
pg_cap ? MLX5_IB_MTT_PRESENT : 0);
/* The pg_access bit allows setting the access flags
* in the page list submitted with the command. */
MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
populate ? pd : dev->umrc.pd);
MLX5_SET(mkc, mkc, free, !populate);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET64(mkc, mkc, len, umem->length);
MLX5_SET(mkc, mkc, bsf_octword_size, 0);
MLX5_SET(mkc, mkc, translations_octword_size,
get_octo_len(iova, umem->length, mr->page_shift));
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
get_octo_len(iova, umem->length, mr->page_shift));
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
mlx5_ib_warn(dev, "create mkey failed\n");
mr->mmkey.type = MLX5_MKEY_MR;
set_mr_fields(dev, mr, umem->length, access_flags, iova);
mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
return ERR_PTR(err);
static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
u64 length, int acc, int mode)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mr *mr;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
return ERR_PTR(-ENOMEM);
in = kzalloc(inlen, GFP_KERNEL);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
MLX5_SET64(mkc, mkc, len, length);
set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
set_mr_fields(dev, mr, length, acc, start_addr);
return ERR_PTR(err);
int mlx5_ib_advise_mr(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
struct ib_sge *sg_list,
struct uverbs_attr_bundle *attrs)
if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
struct ib_dm_mr_attr *attr,
struct uverbs_attr_bundle *attrs)
struct mlx5_ib_dm *mdm = to_mdm(dm);
struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
u64 start_addr = mdm->dev_addr + attr->offset;
switch (mdm->type) {
case MLX5_IB_UAPI_DM_TYPE_MEMIC:
if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
return ERR_PTR(-EINVAL);
mode = MLX5_MKC_ACCESS_MODE_MEMIC;
start_addr -= pci_resource_start(dev->pdev, 0);
case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
return ERR_PTR(-EINVAL);
mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
return ERR_PTR(-EINVAL);
return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
attr->access_flags, mode);
static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
u64 iova, int access_flags)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
unsigned int page_size = mlx5_umem_find_best_pgsz(
umem, mkc, log_page_size, 0, iova);
mutex_lock(&dev->slow_path_mutex);
mr = reg_create(pd, umem, iova, access_flags, page_size, true);
mutex_unlock(&dev->slow_path_mutex);
ib_umem_release(umem);
return ERR_CAST(mr);
mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
* If the MR was created with reg_create then it will be
* configured properly but left disabled. It is safe to go ahead
* and configure it again via UMR while enabling it.
err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
return ERR_PTR(err);
static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
u64 iova, int access_flags,
struct ib_udata *udata)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem_odp *odp;
struct mlx5_ib_mr *mr;
if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
return ERR_PTR(-EOPNOTSUPP);
err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
return ERR_PTR(err);
if (!start && length == U64_MAX) {
return ERR_PTR(-EINVAL);
if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
return ERR_PTR(-EINVAL);
mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
return ERR_CAST(mr);
/* ODP requires xlt update via umr to work. */
if (!mlx5r_umr_can_load_pas(dev, length))
return ERR_PTR(-EINVAL);
odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
return ERR_CAST(odp);
mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
ib_umem_release(&odp->umem);
return ERR_CAST(mr);
xa_init(&mr->implicit_children);
err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
err = mlx5_ib_init_odp_mr(mr);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
return ERR_PTR(err);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 iova, int access_flags,
struct ib_udata *udata)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct ib_umem *umem;
if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
return ERR_PTR(-EOPNOTSUPP);
mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, iova, length, access_flags);
if (access_flags & IB_ACCESS_ON_DEMAND)
return create_user_odp_mr(pd, start, length, iova, access_flags,
umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
return ERR_CAST(umem);
return create_real_mr(pd, umem, iova, access_flags);
static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
struct mlx5_ib_mr *mr = umem_dmabuf->private;
dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
if (!umem_dmabuf->sgt)
mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
ib_umem_dmabuf_unmap_pages(umem_dmabuf);
static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
.allow_peer2peer = 1,
.move_notify = mlx5_ib_dmabuf_invalidate_cb,
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
u64 length, u64 virt_addr,
int fd, int access_flags,
struct ib_udata *udata)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_mr *mr = NULL;
struct ib_umem_dmabuf *umem_dmabuf;
if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
return ERR_PTR(-EOPNOTSUPP);
"offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
offset, virt_addr, length, fd, access_flags);
/* dmabuf requires xlt update via umr to work. */
if (!mlx5r_umr_can_load_pas(dev, length))
return ERR_PTR(-EINVAL);
umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
&mlx5_ib_dmabuf_attach_ops);
if (IS_ERR(umem_dmabuf)) {
mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
PTR_ERR(umem_dmabuf));
return ERR_CAST(umem_dmabuf);
mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
ib_umem_release(&umem_dmabuf->umem);
return ERR_CAST(mr);
mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
umem_dmabuf->private = mr;
err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
err = mlx5_ib_init_dmabuf_mr(mr);
mlx5_ib_dereg_mr(&mr->ibmr, NULL);
return ERR_PTR(err);
* True if the change in access flags can be done via UMR; only some access
* flags can be updated.
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
unsigned int current_access_flags,
unsigned int target_access_flags)
unsigned int diffs = current_access_flags ^ target_access_flags;
if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
return mlx5r_umr_can_reconfig(dev, current_access_flags,
target_access_flags);
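/*
 * For example (a sketch of the check above): toggling only
 * IB_ACCESS_REMOTE_READ or IB_ACCESS_REMOTE_WRITE between the current
 * and target flags keeps the diff inside the allowed mask, so the
 * rereg can go through a UMR WQE, subject to the
 * mlx5r_umr_can_reconfig() check; adding IB_ACCESS_REMOTE_ATOMIC or
 * IB_ACCESS_ON_DEMAND puts a bit outside the mask and forces a full
 * re-registration instead.
 */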
static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
struct ib_umem *new_umem,
int new_access_flags, u64 iova,
unsigned long *page_size)
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
/* We only track the allocated sizes of MRs from the cache */
if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
if (WARN_ON(!*page_size))
return (1ULL << mr->cache_ent->order) >=
ib_umem_num_dma_blocks(new_umem, *page_size);
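/*
 * Capacity check sketch (illustrative numbers): a cache entry of order
 * 8 was created with room for 1 << 8 = 256 translation entries, so a
 * new umem that maps to at most 256 DMA blocks at the chosen page size
 * (for example 1 MiB of memory at a 4 KiB page size) can reuse the
 * existing mkey, while a larger umem cannot and takes a fresh
 * registration instead.
 */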
static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
int access_flags, int flags, struct ib_umem *new_umem,
u64 iova, unsigned long page_size)
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
struct ib_umem *old_umem = mr->umem;
* To keep everything simple the MR is revoked before we start to mess
* with it. This ensures the change is atomic relative to any use of the
err = mlx5r_umr_revoke_mr(mr);
if (flags & IB_MR_REREG_PD) {
upd_flags |= MLX5_IB_UPD_XLT_PD;
if (flags & IB_MR_REREG_ACCESS) {
mr->access_flags = access_flags;
upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
mr->ibmr.iova = iova;
mr->ibmr.length = new_umem->length;
mr->page_shift = order_base_2(page_size);
mr->umem = new_umem;
err = mlx5_ib_update_mr_pas(mr, upd_flags);
* The MR is revoked at this point so there is no issue freeing
mr->umem = old_umem;
atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
ib_umem_release(old_umem);
atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 iova, int new_access_flags,
struct ib_pd *new_pd,
struct ib_udata *udata)
struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
struct mlx5_ib_mr *mr = to_mmr(ib_mr);
if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
return ERR_PTR(-EOPNOTSUPP);
"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, iova, length, new_access_flags);
if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
return ERR_PTR(-EOPNOTSUPP);
if (!(flags & IB_MR_REREG_ACCESS))
new_access_flags = mr->access_flags;
if (!(flags & IB_MR_REREG_PD))
if (!(flags & IB_MR_REREG_TRANS)) {
struct ib_umem *umem;
/* Fast path for PD/access change */
if (can_use_umr_rereg_access(dev, mr->access_flags,
new_access_flags)) {
err = mlx5r_umr_rereg_pd_access(mr, new_pd,
return ERR_PTR(err);
/* DM or ODP MRs don't have a normal umem so we can't re-use it */
if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
* Only one active MR can refer to a umem at one time; revoke
* the old MR before assigning the umem to the new one.
err = mlx5r_umr_revoke_mr(mr);
return ERR_PTR(err);
atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
return create_real_mr(new_pd, umem, mr->ibmr.iova,
* DM doesn't have a PAS list so we can't re-use it; ODP/dmabuf does,
* but the logic around releasing the umem is different
if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
struct ib_umem *new_umem;
unsigned long page_size;
new_umem = ib_umem_get(&dev->ib_dev, start, length,
if (IS_ERR(new_umem))
return ERR_CAST(new_umem);
/* Fast path for PAS change */
if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
new_umem, iova, page_size);
ib_umem_release(new_umem);
return ERR_PTR(err);
return create_real_mr(new_pd, new_umem, iova, new_access_flags);
* Everything else has no state we can preserve; just create a new MR
return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
new_access_flags, udata);
mlx5_alloc_priv_descs(struct ib_device *device,
struct mlx5_ib_mr *mr,
struct mlx5_ib_dev *dev = to_mdev(device);
struct device *ddev = &dev->mdev->pdev->dev;
int size = ndescs * desc_size;
add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
if (!mr->descs_alloc)
mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
if (dma_mapping_error(ddev, mr->desc_map)) {
kfree(mr->descs_alloc);
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
if (!mr->umem && mr->descs) {
struct ib_device *device = mr->ibmr.device;
int size = mr->max_descs * mr->desc_size;
struct mlx5_ib_dev *dev = to_mdev(device);
dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
kfree(mr->descs_alloc);
int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
* Any async use of the mr must hold the refcount; once the refcount
* goes to zero no other thread, such as ODP page faults, prefetch, any
* UMR activity, etc. can touch the mkey. Thus it is safe to destroy it.
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
refcount_read(&mr->mmkey.usecount) != 0 &&
xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
mlx5r_deref_wait_odp_mkey(&mr->mmkey);
if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
mr->sig, NULL, GFP_KERNEL);
rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
if (mlx5_core_destroy_psv(dev->mdev,
mr->sig->psv_memory.psv_idx))
mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
mr->sig->psv_memory.psv_idx);
if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
mr->sig->psv_wire.psv_idx);
if (mr->cache_ent) {
if (mlx5r_umr_revoke_mr(mr)) {
spin_lock_irq(&mr->cache_ent->lock);
mr->cache_ent->total_mrs--;
spin_unlock_irq(&mr->cache_ent->lock);
mr->cache_ent = NULL;
if (!mr->cache_ent) {
rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
bool is_odp = is_odp_mr(mr);
atomic_sub(ib_umem_num_pages(mr->umem),
&dev->mdev->priv.reg_pages);
ib_umem_release(mr->umem);
mlx5_ib_free_odp_mr(mr);
if (mr->cache_ent) {
mlx5_mr_cache_free(dev, mr);
mlx5_free_priv_descs(mr);
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
int access_mode, int page_shift)
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
/* This is only used from the kernel, so setting the PD is OK. */
set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
MLX5_SET(mkc, mkc, free, 1);
MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, log_page_size, page_shift);
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
int ndescs, int desc_size, int page_shift,
int access_mode, u32 *in, int inlen)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
mr->access_mode = access_mode;
mr->desc_size = desc_size;
mr->max_descs = ndescs;
err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
goto err_free_descs;
mr->mmkey.type = MLX5_MKEY_MR;
mr->ibmr.lkey = mr->mmkey.key;
mr->ibmr.rkey = mr->mmkey.key;
mlx5_free_priv_descs(mr);
static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
u32 max_num_sg, u32 max_num_meta_sg,
int desc_size, int access_mode)
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
struct mlx5_ib_mr *mr;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
return ERR_PTR(-ENOMEM);
mr->ibmr.device = pd->device;
in = kzalloc(inlen, GFP_KERNEL);
if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
page_shift = PAGE_SHIFT;
err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
access_mode, in, inlen);
return ERR_PTR(err);
static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
int ndescs, u32 *in, int inlen)
return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
int ndescs, u32 *in, int inlen)
return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
int max_num_sg, int max_num_meta_sg,
struct mlx5_ib_dev *dev = to_mdev(pd->device);
mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
/* create mem & wire PSVs */
err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
mr->sig->psv_memory.psv_idx = psv_index[0];
mr->sig->psv_wire.psv_idx = psv_index[1];
mr->sig->sig_status_checked = true;
mr->sig->sig_err_exists = false;
/* Next UMR, Arm SIGERR */
++mr->sig->sigerr_count;
mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
sizeof(struct mlx5_klm),
MLX5_MKC_ACCESS_MODE_KLMS);
if (IS_ERR(mr->klm_mr)) {
err = PTR_ERR(mr->klm_mr);
goto err_destroy_psv;
mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
sizeof(struct mlx5_mtt),
MLX5_MKC_ACCESS_MODE_MTT);
if (IS_ERR(mr->mtt_mr)) {
err = PTR_ERR(mr->mtt_mr);
goto err_free_klm_mr;
/* Set bsf descriptors for mkey */
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, bsf_en, 1);
MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
goto err_free_mtt_mr;
err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
mr->sig, GFP_KERNEL));
goto err_free_descs;
destroy_mkey(dev, mr);
mlx5_free_priv_descs(mr);
mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
mr->sig->psv_memory.psv_idx);
if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
mr->sig->psv_wire.psv_idx);
static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type, u32 max_num_sg,
u32 max_num_meta_sg)
struct mlx5_ib_dev *dev = to_mdev(pd->device);
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
int ndescs = ALIGN(max_num_sg, 4);
struct mlx5_ib_mr *mr;
mr = kzalloc(sizeof(*mr), GFP_KERNEL);
return ERR_PTR(-ENOMEM);
in = kzalloc(inlen, GFP_KERNEL);
mr->ibmr.device = pd->device;
case IB_MR_TYPE_MEM_REG:
err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
case IB_MR_TYPE_SG_GAPS:
err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
case IB_MR_TYPE_INTEGRITY:
err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
max_num_meta_sg, in, inlen);
mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
return ERR_PTR(err);
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
u32 max_num_sg, u32 max_num_meta_sg)
return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mw *mw = to_mmw(ibmw);
unsigned int ndescs;
struct mlx5_ib_alloc_mw req = {};
__u32 response_length;
err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
if (req.comp_mask || req.reserved1 || req.reserved2)
if (udata->inlen > sizeof(req) &&
!ib_is_udata_cleared(udata, sizeof(req),
udata->inlen - sizeof(req)))
ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
in = kzalloc(inlen, GFP_KERNEL);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, free, 1);
MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
MLX5_SET(mkc, mkc, umr_en, 1);
MLX5_SET(mkc, mkc, lr, 1);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
MLX5_SET(mkc, mkc, qpn, 0xffffff);
err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
mw->mmkey.type = MLX5_MKEY_MW;
ibmw->rkey = mw->mmkey.key;
mw->mmkey.ndescs = ndescs;
resp.response_length =
min(offsetofend(typeof(resp), response_length), udata->outlen);
if (resp.response_length) {
err = ib_copy_to_udata(udata, &resp, resp.response_length);
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
struct mlx5_ib_dev *dev = to_mdev(mw->device);
struct mlx5_ib_mw *mmw = to_mmw(mw);
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
* pagefault_single_data_segment() may be accessing mmw
* if the user bound an ODP MR to this MW.
mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status)
struct mlx5_ib_mr *mmr = to_mmr(ibmr);
if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
pr_err("Invalid status check mask\n");
mr_status->fail_status = 0;
if (check_mask & IB_MR_CHECK_SIG_STATUS) {
pr_err("signature status check requested on a non-signature enabled MR\n");
mmr->sig->sig_status_checked = true;
if (!mmr->sig->sig_err_exists)
if (ibmr->lkey == mmr->sig->err_item.key)
memcpy(&mr_status->sig_err, &mmr->sig->err_item,
sizeof(mr_status->sig_err));
mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
mr_status->sig_err.sig_err_offset = 0;
mr_status->sig_err.key = mmr->sig->err_item.key;
mmr->sig->sig_err_exists = false;
mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
int data_sg_nents, unsigned int *data_sg_offset,
struct scatterlist *meta_sg, int meta_sg_nents,
unsigned int *meta_sg_offset)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
unsigned int sg_offset = 0;
mr->meta_length = 0;
if (data_sg_nents == 1) {
mr->mmkey.ndescs = 1;
sg_offset = *data_sg_offset;
mr->data_length = sg_dma_len(data_sg) - sg_offset;
mr->data_iova = sg_dma_address(data_sg) + sg_offset;
if (meta_sg_nents == 1) {
mr->meta_ndescs = 1;
sg_offset = *meta_sg_offset;
mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
ibmr->length = mr->data_length + mr->meta_length;
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
struct scatterlist *sgl,
unsigned short sg_nents,
unsigned int *sg_offset_p,
struct scatterlist *meta_sgl,
unsigned short meta_sg_nents,
unsigned int *meta_sg_offset_p)
struct scatterlist *sg = sgl;
struct mlx5_klm *klms = mr->descs;
unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
u32 lkey = mr->ibmr.pd->local_dma_lkey;
mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
mr->ibmr.length = 0;
for_each_sg(sgl, sg, sg_nents, i) {
if (unlikely(i >= mr->max_descs))
klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
klms[i].key = cpu_to_be32(lkey);
mr->ibmr.length += sg_dma_len(sg) - sg_offset;
*sg_offset_p = sg_offset;
mr->mmkey.ndescs = i;
mr->data_length = mr->ibmr.length;
if (meta_sg_nents) {
sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
if (unlikely(i + j >= mr->max_descs))
klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
klms[i + j].key = cpu_to_be32(lkey);
mr->ibmr.length += sg_dma_len(sg) - sg_offset;
if (meta_sg_offset_p)
*meta_sg_offset_p = sg_offset;
mr->meta_ndescs = j;
mr->meta_length = mr->ibmr.length - mr->data_length;
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
if (unlikely(mr->mmkey.ndescs == mr->max_descs))
descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
int data_sg_nents, unsigned int *data_sg_offset,
struct scatterlist *meta_sg, int meta_sg_nents,
unsigned int *meta_sg_offset)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
pi_mr->mmkey.ndescs = 0;
pi_mr->meta_ndescs = 0;
pi_mr->meta_length = 0;
ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
pi_mr->desc_size * pi_mr->max_descs,
pi_mr->ibmr.page_size = ibmr->page_size;
n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
if (n != data_sg_nents)
pi_mr->data_iova = pi_mr->ibmr.iova;
pi_mr->data_length = pi_mr->ibmr.length;
pi_mr->ibmr.length = pi_mr->data_length;
ibmr->length = pi_mr->data_length;
if (meta_sg_nents) {
u64 page_mask = ~((u64)ibmr->page_size - 1);
u64 iova = pi_mr->data_iova;
n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
meta_sg_offset, mlx5_set_page_pi);
pi_mr->meta_length = pi_mr->ibmr.length;
* PI address for the HW is the offset of the metadata address
* relative to the first data page address.
* It equals the first data page address + size of data pages +
* metadata offset at the first metadata page
pi_mr->pi_iova = (iova & page_mask) +
pi_mr->mmkey.ndescs * ibmr->page_size +
(pi_mr->ibmr.iova & ~page_mask);
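/*
 * Worked example (a sketch with made-up numbers): with a 4 KiB
 * ibmr->page_size, a data buffer starting at iova 0x10000 that maps to
 * three MTT entries, and a metadata buffer whose own iova has offset
 * 0x100 within its page, the computation gives
 * pi_iova = 0x10000 + 3 * 4096 + 0x100 = 0x13100, i.e. the metadata is
 * addressed immediately after the data pages within the same mkey.
 */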
* In order to use one MTT MR for data and metadata, we also register
* the gaps between the end of the data and the start of
* the metadata (the sig MR will verify that the HW accesses
* the right addresses). This mapping is safe because we use an
* internal mkey for the registration.
pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
pi_mr->ibmr.iova = iova;
ibmr->length += pi_mr->meta_length;
ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
pi_mr->desc_size * pi_mr->max_descs,
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
int data_sg_nents, unsigned int *data_sg_offset,
struct scatterlist *meta_sg, int meta_sg_nents,
unsigned int *meta_sg_offset)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_mr *pi_mr = mr->klm_mr;
pi_mr->mmkey.ndescs = 0;
pi_mr->meta_ndescs = 0;
pi_mr->meta_length = 0;
ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
pi_mr->desc_size * pi_mr->max_descs,
n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
meta_sg, meta_sg_nents, meta_sg_offset);
ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
pi_mr->desc_size * pi_mr->max_descs,
/* This is a zero-based memory region */
pi_mr->data_iova = 0;
pi_mr->ibmr.iova = 0;
pi_mr->pi_iova = pi_mr->data_length;
ibmr->length = pi_mr->ibmr.length;
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
int data_sg_nents, unsigned int *data_sg_offset,
struct scatterlist *meta_sg, int meta_sg_nents,
unsigned int *meta_sg_offset)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
struct mlx5_ib_mr *pi_mr = NULL;
WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
mr->mmkey.ndescs = 0;
mr->data_length = 0;
mr->meta_ndescs = 0;
* As a performance optimization, if possible, there is no need to
* perform a UMR operation to register the data/metadata buffers.
* First try to map the sg lists to PA descriptors with local_dma_lkey.
* Fall back to UMR only in case of a failure.
n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
data_sg_offset, meta_sg, meta_sg_nents,
if (n == data_sg_nents + meta_sg_nents)
* As a performance optimization, if possible, there is no need to map
* the sg lists to KLM descriptors. First try to map the sg lists to MTT
* descriptors and fall back to KLM only in case of a failure.
* It's more efficient for the HW to work with MTT descriptors
* (especially in high load).
* Use KLM (indirect access) only if it's mandatory.
n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
data_sg_offset, meta_sg, meta_sg_nents,
if (n == data_sg_nents + meta_sg_nents)
n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
data_sg_offset, meta_sg, meta_sg_nents,
if (unlikely(n != data_sg_nents + meta_sg_nents))
/* This is a zero-based memory region */
ibmr->sig_attrs->meta_length = pi_mr->meta_length;
ibmr->sig_attrs->meta_length = mr->meta_length;
int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
unsigned int *sg_offset)
struct mlx5_ib_mr *mr = to_mmr(ibmr);
mr->mmkey.ndescs = 0;
ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
mr->desc_size * mr->max_descs,
if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
mr->desc_size * mr->max_descs,