drivers/infiniband/sw/siw/siw_verbs.c
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/uaccess.h>
9 #include <linux/vmalloc.h>
10 #include <linux/xarray.h>
11
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14 #include <rdma/ib_user_verbs.h>
15 #include <rdma/uverbs_ioctl.h>
16
17 #include "siw.h"
18 #include "siw_verbs.h"
19 #include "siw_mem.h"
20
21 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
22         [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
23         [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
24         [IB_QPS_RTR] = SIW_QP_STATE_RTR,
25         [IB_QPS_RTS] = SIW_QP_STATE_RTS,
26         [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
27         [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
28         [IB_QPS_ERR] = SIW_QP_STATE_ERROR
29 };
30
31 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
32         [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
33         [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
34         [IB_QPS_ERR] = "ERR"
35 };
36
37 void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
38 {
39         struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
40
41         kfree(entry);
42 }
43
44 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
45 {
46         struct siw_ucontext *uctx = to_siw_ctx(ctx);
47         size_t size = vma->vm_end - vma->vm_start;
48         struct rdma_user_mmap_entry *rdma_entry;
49         struct siw_user_mmap_entry *entry;
50         int rv = -EINVAL;
51
52         /*
53          * Must be page aligned
54          */
55         if (vma->vm_start & (PAGE_SIZE - 1)) {
56                 pr_warn("siw: mmap not page aligned\n");
57                 return -EINVAL;
58         }
59         rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
60         if (!rdma_entry) {
61                 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
62                         vma->vm_pgoff, size);
63                 return -EINVAL;
64         }
65         entry = to_siw_mmap_entry(rdma_entry);
66
67         rv = remap_vmalloc_range(vma, entry->address, 0);
68         if (rv) {
69                 pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
70                         size);
71                 goto out;
72         }
73 out:
74         rdma_user_mmap_entry_put(rdma_entry);
75
76         return rv;
77 }
78
79 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
80 {
81         struct siw_device *sdev = to_siw_dev(base_ctx->device);
82         struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
83         struct siw_uresp_alloc_ctx uresp = {};
84         int rv;
85
86         if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
87                 rv = -ENOMEM;
88                 goto err_out;
89         }
90         ctx->sdev = sdev;
91
92         uresp.dev_id = sdev->vendor_part_id;
93
94         if (udata->outlen < sizeof(uresp)) {
95                 rv = -EINVAL;
96                 goto err_out;
97         }
98         rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
99         if (rv)
100                 goto err_out;
101
102         siw_dbg(base_ctx->device, "success. now %d context(s)\n",
103                 atomic_read(&sdev->num_ctx));
104
105         return 0;
106
107 err_out:
108         atomic_dec(&sdev->num_ctx);
109         siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
110                 atomic_read(&sdev->num_ctx));
111
112         return rv;
113 }
114
115 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
116 {
117         struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
118
119         atomic_dec(&uctx->sdev->num_ctx);
120 }
121
122 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
123                      struct ib_udata *udata)
124 {
125         struct siw_device *sdev = to_siw_dev(base_dev);
126
127         if (udata->inlen || udata->outlen)
128                 return -EINVAL;
129
130         memset(attr, 0, sizeof(*attr));
131
132         /* Revisit atomic caps if RFC 7306 gets supported */
133         attr->atomic_cap = 0;
134         attr->device_cap_flags =
135                 IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
136         attr->max_cq = sdev->attrs.max_cq;
137         attr->max_cqe = sdev->attrs.max_cqe;
138         attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
139         attr->max_mr = sdev->attrs.max_mr;
140         attr->max_mw = sdev->attrs.max_mw;
141         attr->max_mr_size = ~0ull;
142         attr->max_pd = sdev->attrs.max_pd;
143         attr->max_qp = sdev->attrs.max_qp;
144         attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
145         attr->max_qp_rd_atom = sdev->attrs.max_ord;
146         attr->max_qp_wr = sdev->attrs.max_qp_wr;
147         attr->max_recv_sge = sdev->attrs.max_sge;
148         attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
149         attr->max_send_sge = sdev->attrs.max_sge;
150         attr->max_sge_rd = sdev->attrs.max_sge_rd;
151         attr->max_srq = sdev->attrs.max_srq;
152         attr->max_srq_sge = sdev->attrs.max_srq_sge;
153         attr->max_srq_wr = sdev->attrs.max_srq_wr;
154         attr->page_size_cap = PAGE_SIZE;
155         attr->vendor_id = SIW_VENDOR_ID;
156         attr->vendor_part_id = sdev->vendor_part_id;
157
158         memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
159
160         return 0;
161 }
162
163 int siw_query_port(struct ib_device *base_dev, u32 port,
164                    struct ib_port_attr *attr)
165 {
166         struct siw_device *sdev = to_siw_dev(base_dev);
167         int rv;
168
169         memset(attr, 0, sizeof(*attr));
170
171         rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
172                          &attr->active_width);
173         attr->gid_tbl_len = 1;
174         attr->max_msg_sz = -1;
175         attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
176         attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
177         attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
178                 IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
179         attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
180         attr->state = sdev->state;
181         /*
182          * All zero
183          *
184          * attr->lid = 0;
185          * attr->bad_pkey_cntr = 0;
186          * attr->qkey_viol_cntr = 0;
187          * attr->sm_lid = 0;
188          * attr->lmc = 0;
189          * attr->max_vl_num = 0;
190          * attr->sm_sl = 0;
191          * attr->subnet_timeout = 0;
192          * attr->init_type_reply = 0;
193          */
194         return rv;
195 }
196
197 int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
198                            struct ib_port_immutable *port_immutable)
199 {
200         struct ib_port_attr attr;
201         int rv = siw_query_port(base_dev, port, &attr);
202
203         if (rv)
204                 return rv;
205
206         port_immutable->gid_tbl_len = attr.gid_tbl_len;
207         port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
208
209         return 0;
210 }
211
212 int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
213                   union ib_gid *gid)
214 {
215         struct siw_device *sdev = to_siw_dev(base_dev);
216
217         /* subnet_prefix == interface_id == 0; */
218         memset(gid, 0, sizeof(*gid));
219         memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
220
221         return 0;
222 }
223
224 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
225 {
226         struct siw_device *sdev = to_siw_dev(pd->device);
227
228         if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
229                 atomic_dec(&sdev->num_pd);
230                 return -ENOMEM;
231         }
232         siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
233
234         return 0;
235 }
236
237 int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
238 {
239         struct siw_device *sdev = to_siw_dev(pd->device);
240
241         siw_dbg_pd(pd, "free PD\n");
242         atomic_dec(&sdev->num_pd);
243         return 0;
244 }
245
246 void siw_qp_get_ref(struct ib_qp *base_qp)
247 {
248         siw_qp_get(to_siw_qp(base_qp));
249 }
250
251 void siw_qp_put_ref(struct ib_qp *base_qp)
252 {
253         siw_qp_put(to_siw_qp(base_qp));
254 }
255
256 static struct rdma_user_mmap_entry *
257 siw_mmap_entry_insert(struct siw_ucontext *uctx,
258                       void *address, size_t length,
259                       u64 *offset)
260 {
261         struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
262         int rv;
263
264         *offset = SIW_INVAL_UOBJ_KEY;
265         if (!entry)
266                 return NULL;
267
268         entry->address = address;
269
270         rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
271                                          &entry->rdma_entry,
272                                          length);
273         if (rv) {
274                 kfree(entry);
275                 return NULL;
276         }
277
278         *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
279
280         return &entry->rdma_entry;
281 }
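
/*
 * A minimal sketch of the mmap handoff built on siw_mmap_entry_insert()
 * and siw_mmap(): the driver returns the entry's offset (e.g. uresp.sq_key
 * in siw_create_qp() below) to user space, which maps the queue with a
 * plain mmap() on the uverbs command fd. Names here are placeholders,
 * not part of the driver:
 *
 *        void *sq = mmap(NULL, sq_size, PROT_READ | PROT_WRITE,
 *                        MAP_SHARED, cmd_fd, sq_key);
 *
 * siw_mmap() then resolves the offset via rdma_user_mmap_entry_get()
 * and maps the vmalloc()ed queue memory with remap_vmalloc_range().
 */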
282
283 /*
284  * siw_create_qp()
285  *
286  * Create QP of requested size on given device.
287  *
288  * @pd:         Protection Domain
289  * @attrs:      Initial QP attributes.
290  * @udata:      used to provide QP ID, SQ and RQ size back to user.
291  */
292
293 struct ib_qp *siw_create_qp(struct ib_pd *pd,
294                             struct ib_qp_init_attr *attrs,
295                             struct ib_udata *udata)
296 {
297         struct siw_qp *qp = NULL;
298         struct ib_device *base_dev = pd->device;
299         struct siw_device *sdev = to_siw_dev(base_dev);
300         struct siw_ucontext *uctx =
301                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
302                                           base_ucontext);
303         struct siw_cq *scq = NULL, *rcq = NULL;
304         unsigned long flags;
305         int num_sqe, num_rqe, rv = 0;
306         size_t length;
307
308         siw_dbg(base_dev, "create new QP\n");
309
310         if (attrs->create_flags)
311                 return ERR_PTR(-EOPNOTSUPP);
312
313         if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
314                 siw_dbg(base_dev, "too many QP's\n");
315                 rv = -ENOMEM;
316                 goto err_out;
317         }
318         if (attrs->qp_type != IB_QPT_RC) {
319                 siw_dbg(base_dev, "only RC QP's supported\n");
320                 rv = -EOPNOTSUPP;
321                 goto err_out;
322         }
323         if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
324             (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
325             (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
326             (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
327                 siw_dbg(base_dev, "QP size error\n");
328                 rv = -EINVAL;
329                 goto err_out;
330         }
331         if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
332                 siw_dbg(base_dev, "max inline send: %d > %d\n",
333                         attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
334                 rv = -EINVAL;
335                 goto err_out;
336         }
337         /*
338          * NOTE: zero-element SGLs are allowed in SQ and RQ WQEs,
339          * but the QP must be able to hold at least one WQE (SQ + RQ)
340          */
341         if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
342                 siw_dbg(base_dev, "QP must have send or receive queue\n");
343                 rv = -EINVAL;
344                 goto err_out;
345         }
346         scq = to_siw_cq(attrs->send_cq);
347         rcq = to_siw_cq(attrs->recv_cq);
348
349         if (!scq || (!rcq && !attrs->srq)) {
350                 siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
351                 rv = -EINVAL;
352                 goto err_out;
353         }
354         qp = kzalloc(sizeof(*qp), GFP_KERNEL);
355         if (!qp) {
356                 rv = -ENOMEM;
357                 goto err_out;
358         }
359         init_rwsem(&qp->state_lock);
360         spin_lock_init(&qp->sq_lock);
361         spin_lock_init(&qp->rq_lock);
362         spin_lock_init(&qp->orq_lock);
363
364         rv = siw_qp_add(sdev, qp);
365         if (rv)
366                 goto err_out;
367
368         num_sqe = attrs->cap.max_send_wr;
369         num_rqe = attrs->cap.max_recv_wr;
370
371         /* All queue indices are derived from modulo operations
372          * on free-running 'get' (consumer) and 'put' (producer)
373          * unsigned counters. Keeping queue sizes a power of two
374          * avoids special handling of counter wrap-around.
375          */
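        /*
         * For example, with num_sqe = 8 the SQ index used at post time
         * reduces to sq_put % 8 == sq_put & 7. Since 2^32 is a multiple
         * of any power-of-two queue size, the index stays consistent
         * when the free-running u32 counter wraps from 0xffffffff to 0.
         */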
376         if (num_sqe)
377                 num_sqe = roundup_pow_of_two(num_sqe);
378         else {
379                 /* Zero sized SQ is not supported */
380                 rv = -EINVAL;
381                 goto err_out;
382         }
383         if (num_rqe)
384                 num_rqe = roundup_pow_of_two(num_rqe);
385
386         if (udata)
387                 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
388         else
389                 qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
390
391         if (qp->sendq == NULL) {
392                 rv = -ENOMEM;
393                 goto err_out_xa;
394         }
395         if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
396                 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
397                         qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
398                 else {
399                         rv = -EINVAL;
400                         goto err_out_xa;
401                 }
402         }
403         qp->pd = pd;
404         qp->scq = scq;
405         qp->rcq = rcq;
406
407         if (attrs->srq) {
408                 /*
409                  * SRQ support.
410                  * Verbs 6.3.7: ignore RQ size, if SRQ present
411                  * Verbs 6.3.5: do not check PD of SRQ against PD of QP
412                  */
413                 qp->srq = to_siw_srq(attrs->srq);
414                 qp->attrs.rq_size = 0;
415                 siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
416                         qp->base_qp.qp_num);
417         } else if (num_rqe) {
418                 if (udata)
419                         qp->recvq =
420                                 vmalloc_user(num_rqe * sizeof(struct siw_rqe));
421                 else
422                         qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
423
424                 if (qp->recvq == NULL) {
425                         rv = -ENOMEM;
426                         goto err_out_xa;
427                 }
428                 qp->attrs.rq_size = num_rqe;
429         }
430         qp->attrs.sq_size = num_sqe;
431         qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
432         qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
433
434         /* Make those two tunables fixed for now. */
435         qp->tx_ctx.gso_seg_limit = 1;
436         qp->tx_ctx.zcopy_tx = zcopy_tx;
437
438         qp->attrs.state = SIW_QP_STATE_IDLE;
439
440         if (udata) {
441                 struct siw_uresp_create_qp uresp = {};
442
443                 uresp.num_sqe = num_sqe;
444                 uresp.num_rqe = num_rqe;
445                 uresp.qp_id = qp_id(qp);
446
447                 if (qp->sendq) {
448                         length = num_sqe * sizeof(struct siw_sqe);
449                         qp->sq_entry =
450                                 siw_mmap_entry_insert(uctx, qp->sendq,
451                                                       length, &uresp.sq_key);
452                         if (!qp->sq_entry) {
453                                 rv = -ENOMEM;
454                                 goto err_out_xa;
455                         }
456                 }
457
458                 if (qp->recvq) {
459                         length = num_rqe * sizeof(struct siw_rqe);
460                         qp->rq_entry =
461                                 siw_mmap_entry_insert(uctx, qp->recvq,
462                                                       length, &uresp.rq_key);
463                         if (!qp->rq_entry) {
464                                 uresp.sq_key = SIW_INVAL_UOBJ_KEY;
465                                 rv = -ENOMEM;
466                                 goto err_out_xa;
467                         }
468                 }
469
470                 if (udata->outlen < sizeof(uresp)) {
471                         rv = -EINVAL;
472                         goto err_out_xa;
473                 }
474                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
475                 if (rv)
476                         goto err_out_xa;
477         }
478         qp->tx_cpu = siw_get_tx_cpu(sdev);
479         if (qp->tx_cpu < 0) {
480                 rv = -EINVAL;
481                 goto err_out_xa;
482         }
483         INIT_LIST_HEAD(&qp->devq);
484         spin_lock_irqsave(&sdev->lock, flags);
485         list_add_tail(&qp->devq, &sdev->qp_list);
486         spin_unlock_irqrestore(&sdev->lock, flags);
487
488         return &qp->base_qp;
489
490 err_out_xa:
491         xa_erase(&sdev->qp_xa, qp_id(qp));
492 err_out:
493         if (qp) {
494                 if (uctx) {
495                         rdma_user_mmap_entry_remove(qp->sq_entry);
496                         rdma_user_mmap_entry_remove(qp->rq_entry);
497                 }
498                 vfree(qp->sendq);
499                 vfree(qp->recvq);
500                 kfree(qp);
501         }
502         atomic_dec(&sdev->num_qp);
503
504         return ERR_PTR(rv);
505 }
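
/*
 * A minimal kernel-ULP sketch of creating an RC QP served by
 * siw_create_qp() above ('pd' and 'cq' are placeholders, error
 * handling omitted):
 *
 *        struct ib_qp_init_attr qpia = {
 *                .send_cq     = cq,
 *                .recv_cq     = cq,
 *                .cap         = {
 *                        .max_send_wr  = 48,
 *                        .max_recv_wr  = 48,
 *                        .max_send_sge = 4,
 *                        .max_recv_sge = 4,
 *                },
 *                .sq_sig_type = IB_SIGNAL_REQ_WR,
 *                .qp_type     = IB_QPT_RC,
 *        };
 *        struct ib_qp *qp = ib_create_qp(pd, &qpia);
 *
 * The requested 48 send and receive WRs are rounded up to 64-entry
 * queues, as reported back in uresp.num_sqe/num_rqe for user clients.
 */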
506
507 /*
508  * Minimum siw_query_qp() verb interface.
509  *
510  * @qp_attr_mask is not used but all available information is provided
511  */
512 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
513                  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
514 {
515         struct siw_qp *qp;
516         struct siw_device *sdev;
517
518         if (base_qp && qp_attr && qp_init_attr) {
519                 qp = to_siw_qp(base_qp);
520                 sdev = to_siw_dev(base_qp->device);
521         } else {
522                 return -EINVAL;
523         }
524         qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
525         qp_attr->cap.max_send_wr = qp->attrs.sq_size;
526         qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
527         qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
528         qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
529         qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
530         qp_attr->max_rd_atomic = qp->attrs.irq_size;
531         qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
532
533         qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
534                                    IB_ACCESS_REMOTE_WRITE |
535                                    IB_ACCESS_REMOTE_READ;
536
537         qp_init_attr->qp_type = base_qp->qp_type;
538         qp_init_attr->send_cq = base_qp->send_cq;
539         qp_init_attr->recv_cq = base_qp->recv_cq;
540         qp_init_attr->srq = base_qp->srq;
541
542         qp_init_attr->cap = qp_attr->cap;
543
544         return 0;
545 }
546
547 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
548                         int attr_mask, struct ib_udata *udata)
549 {
550         struct siw_qp_attrs new_attrs;
551         enum siw_qp_attr_mask siw_attr_mask = 0;
552         struct siw_qp *qp = to_siw_qp(base_qp);
553         int rv = 0;
554
555         if (!attr_mask)
556                 return 0;
557
558         if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
559                 return -EOPNOTSUPP;
560
561         memset(&new_attrs, 0, sizeof(new_attrs));
562
563         if (attr_mask & IB_QP_ACCESS_FLAGS) {
564                 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
565
566                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
567                         new_attrs.flags |= SIW_RDMA_READ_ENABLED;
568                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
569                         new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
570                 if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
571                         new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
572         }
573         if (attr_mask & IB_QP_STATE) {
574                 siw_dbg_qp(qp, "desired IB QP state: %s\n",
575                            ib_qp_state_to_string[attr->qp_state]);
576
577                 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
578
579                 if (new_attrs.state > SIW_QP_STATE_RTS)
580                         qp->tx_ctx.tx_suspend = 1;
581
582                 siw_attr_mask |= SIW_QP_ATTR_STATE;
583         }
584         if (!siw_attr_mask)
585                 goto out;
586
587         down_write(&qp->state_lock);
588
589         rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
590
591         up_write(&qp->state_lock);
592 out:
593         return rv;
594 }
595
596 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
597 {
598         struct siw_qp *qp = to_siw_qp(base_qp);
599         struct siw_ucontext *uctx =
600                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
601                                           base_ucontext);
602         struct siw_qp_attrs qp_attrs;
603
604         siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
605
606         /*
607          * Mark QP as being destroyed to prevent any further
608          * async callbacks to the RDMA core
609          */
610         qp->attrs.flags |= SIW_QP_IN_DESTROY;
611         qp->rx_stream.rx_suspend = 1;
612
613         if (uctx) {
614                 rdma_user_mmap_entry_remove(qp->sq_entry);
615                 rdma_user_mmap_entry_remove(qp->rq_entry);
616         }
617
618         down_write(&qp->state_lock);
619
620         qp_attrs.state = SIW_QP_STATE_ERROR;
621         siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
622
623         if (qp->cep) {
624                 siw_cep_put(qp->cep);
625                 qp->cep = NULL;
626         }
627         up_write(&qp->state_lock);
628
629         kfree(qp->tx_ctx.mpa_crc_hd);
630         kfree(qp->rx_stream.mpa_crc_hd);
631
632         qp->scq = qp->rcq = NULL;
633
634         siw_qp_put(qp);
635
636         return 0;
637 }
638
639 /*
640  * siw_copy_inline_sgl()
641  *
642  * Prepare sgl of inlined data for sending. For userland callers,
643  * the function checks that the given buffer addresses and lengths
644  * are within process context bounds.
645  * Data from all provided sge's is copied together into the wqe,
646  * referenced by a single sge.
647  */
648 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
649                                struct siw_sqe *sqe)
650 {
651         struct ib_sge *core_sge = core_wr->sg_list;
652         void *kbuf = &sqe->sge[1];
653         int num_sge = core_wr->num_sge, bytes = 0;
654
655         sqe->sge[0].laddr = (uintptr_t)kbuf;
656         sqe->sge[0].lkey = 0;
657
658         while (num_sge--) {
659                 if (!core_sge->length) {
660                         core_sge++;
661                         continue;
662                 }
663                 bytes += core_sge->length;
664                 if (bytes > SIW_MAX_INLINE) {
665                         bytes = -EINVAL;
666                         break;
667                 }
668                 memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
669                        core_sge->length);
670
671                 kbuf += core_sge->length;
672                 core_sge++;
673         }
674         sqe->sge[0].length = bytes > 0 ? bytes : 0;
675         sqe->num_sge = bytes > 0 ? 1 : 0;
676
677         return bytes;
678 }
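
/*
 * A minimal sketch of a kernel caller posting an inline send that gets
 * flattened by siw_copy_inline_sgl() ('qp', 'hdr' and 'payload' are
 * placeholders; the total length must not exceed SIW_MAX_INLINE):
 *
 *        struct ib_sge sge[2] = {
 *                { .addr = (uintptr_t)hdr,     .length = hdr_len     },
 *                { .addr = (uintptr_t)payload, .length = payload_len },
 *        };
 *        struct ib_send_wr wr = {
 *                .wr_id      = my_cookie,
 *                .sg_list    = sge,
 *                .num_sge    = 2,
 *                .opcode     = IB_WR_SEND,
 *                .send_flags = IB_SEND_INLINE | IB_SEND_SIGNALED,
 *        };
 *        const struct ib_send_wr *bad_wr;
 *        int rc = ib_post_send(qp, &wr, &bad_wr);
 *
 * Both buffers are copied into a single SGE inside the SQE, so no lkey
 * is needed and the buffers may be reused right after ib_post_send()
 * returns.
 */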
679
680 /* Complete SQ WR's without processing */
681 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
682                            const struct ib_send_wr **bad_wr)
683 {
684         struct siw_sqe sqe = {};
685         int rv = 0;
686
687         while (wr) {
688                 sqe.id = wr->wr_id;
689                 sqe.opcode = wr->opcode;
690                 rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
691                 if (rv) {
692                         if (bad_wr)
693                                 *bad_wr = wr;
694                         break;
695                 }
696                 wr = wr->next;
697         }
698         return rv;
699 }
700
701 /* Complete RQ WR's without processing */
702 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
703                            const struct ib_recv_wr **bad_wr)
704 {
705         struct siw_rqe rqe = {};
706         int rv = 0;
707
708         while (wr) {
709                 rqe.id = wr->wr_id;
710                 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
711                 if (rv) {
712                         if (bad_wr)
713                                 *bad_wr = wr;
714                         break;
715                 }
716                 wr = wr->next;
717         }
718         return rv;
719 }
720
721 /*
722  * siw_post_send()
723  *
724  * Post a list of S-WR's to a SQ.
725  *
726  * @base_qp:    Base QP contained in siw QP
727  * @wr:         Null terminated list of user WR's
728  * @bad_wr:     Points to failing WR in case of synchronous failure.
729  */
730 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
731                   const struct ib_send_wr **bad_wr)
732 {
733         struct siw_qp *qp = to_siw_qp(base_qp);
734         struct siw_wqe *wqe = tx_wqe(qp);
735
736         unsigned long flags;
737         int rv = 0;
738
739         if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
740                 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
741                 *bad_wr = wr;
742                 return -EINVAL;
743         }
744
745         /*
746          * Try to acquire QP state lock. Must be non-blocking
747          * to accommodate kernel clients' needs.
748          */
749         if (!down_read_trylock(&qp->state_lock)) {
750                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
751                         /*
752                          * ERROR state is final, so we can be sure
753                          * this state will not change as long as the QP
754                          * exists.
755                          *
756                          * This handles an ib_drain_sq() call with
757                          * a concurrent request to set the QP state
758                          * to ERROR.
759                          */
760                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
761                 } else {
762                         siw_dbg_qp(qp, "QP locked, state %d\n",
763                                    qp->attrs.state);
764                         *bad_wr = wr;
765                         rv = -ENOTCONN;
766                 }
767                 return rv;
768         }
769         if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
770                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
771                         /*
772                          * Immediately flush this WR to CQ, if QP
773                          * is in ERROR state. SQ is guaranteed to
774                          * be empty, so WR completes in-order.
775                          *
776                          * Typically triggered by ib_drain_sq().
777                          */
778                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
779                 } else {
780                         siw_dbg_qp(qp, "QP out of state %d\n",
781                                    qp->attrs.state);
782                         *bad_wr = wr;
783                         rv = -ENOTCONN;
784                 }
785                 up_read(&qp->state_lock);
786                 return rv;
787         }
788         spin_lock_irqsave(&qp->sq_lock, flags);
789
790         while (wr) {
791                 u32 idx = qp->sq_put % qp->attrs.sq_size;
792                 struct siw_sqe *sqe = &qp->sendq[idx];
793
794                 if (sqe->flags) {
795                         siw_dbg_qp(qp, "sq full\n");
796                         rv = -ENOMEM;
797                         break;
798                 }
799                 if (wr->num_sge > qp->attrs.sq_max_sges) {
800                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
801                         rv = -EINVAL;
802                         break;
803                 }
804                 sqe->id = wr->wr_id;
805
806                 if ((wr->send_flags & IB_SEND_SIGNALED) ||
807                     (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
808                         sqe->flags |= SIW_WQE_SIGNALLED;
809
810                 if (wr->send_flags & IB_SEND_FENCE)
811                         sqe->flags |= SIW_WQE_READ_FENCE;
812
813                 switch (wr->opcode) {
814                 case IB_WR_SEND:
815                 case IB_WR_SEND_WITH_INV:
816                         if (wr->send_flags & IB_SEND_SOLICITED)
817                                 sqe->flags |= SIW_WQE_SOLICITED;
818
819                         if (!(wr->send_flags & IB_SEND_INLINE)) {
820                                 siw_copy_sgl(wr->sg_list, sqe->sge,
821                                              wr->num_sge);
822                                 sqe->num_sge = wr->num_sge;
823                         } else {
824                                 rv = siw_copy_inline_sgl(wr, sqe);
825                                 if (rv <= 0) {
826                                         rv = -EINVAL;
827                                         break;
828                                 }
829                                 sqe->flags |= SIW_WQE_INLINE;
830                                 sqe->num_sge = 1;
831                         }
832                         if (wr->opcode == IB_WR_SEND)
833                                 sqe->opcode = SIW_OP_SEND;
834                         else {
835                                 sqe->opcode = SIW_OP_SEND_REMOTE_INV;
836                                 sqe->rkey = wr->ex.invalidate_rkey;
837                         }
838                         break;
839
840                 case IB_WR_RDMA_READ_WITH_INV:
841                 case IB_WR_RDMA_READ:
842                         /*
843                          * iWarp restricts the RREAD sink to an SGL with
844                          * a single SGE. We could relax this to an SGL with
845                          * multiple elements referring to the SAME ltag, or
846                          * even send a private per-rreq tag referring to a
847                          * checked local sgl with MULTIPLE ltag's.
848                          */
849                         if (unlikely(wr->num_sge != 1)) {
850                                 rv = -EINVAL;
851                                 break;
852                         }
853                         siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
854                         /*
855                          * NOTE: zero length RREAD is allowed!
856                          */
857                         sqe->raddr = rdma_wr(wr)->remote_addr;
858                         sqe->rkey = rdma_wr(wr)->rkey;
859                         sqe->num_sge = 1;
860
861                         if (wr->opcode == IB_WR_RDMA_READ)
862                                 sqe->opcode = SIW_OP_READ;
863                         else
864                                 sqe->opcode = SIW_OP_READ_LOCAL_INV;
865                         break;
866
867                 case IB_WR_RDMA_WRITE:
868                         if (!(wr->send_flags & IB_SEND_INLINE)) {
869                                 siw_copy_sgl(wr->sg_list, &sqe->sge[0],
870                                              wr->num_sge);
871                                 sqe->num_sge = wr->num_sge;
872                         } else {
873                                 rv = siw_copy_inline_sgl(wr, sqe);
874                                 if (unlikely(rv < 0)) {
875                                         rv = -EINVAL;
876                                         break;
877                                 }
878                                 sqe->flags |= SIW_WQE_INLINE;
879                                 sqe->num_sge = 1;
880                         }
881                         sqe->raddr = rdma_wr(wr)->remote_addr;
882                         sqe->rkey = rdma_wr(wr)->rkey;
883                         sqe->opcode = SIW_OP_WRITE;
884                         break;
885
886                 case IB_WR_REG_MR:
887                         sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
888                         sqe->rkey = reg_wr(wr)->key;
889                         sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
890                         sqe->opcode = SIW_OP_REG_MR;
891                         break;
892
893                 case IB_WR_LOCAL_INV:
894                         sqe->rkey = wr->ex.invalidate_rkey;
895                         sqe->opcode = SIW_OP_INVAL_STAG;
896                         break;
897
898                 default:
899                         siw_dbg_qp(qp, "ib wr type %d unsupported\n",
900                                    wr->opcode);
901                         rv = -EINVAL;
902                         break;
903                 }
904                 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
905                            sqe->opcode, sqe->flags,
906                            (void *)(uintptr_t)sqe->id);
907
908                 if (unlikely(rv < 0))
909                         break;
910
911                 /* make SQE only valid after completely written */
912                 smp_wmb();
913                 sqe->flags |= SIW_WQE_VALID;
914
915                 qp->sq_put++;
916                 wr = wr->next;
917         }
918
919         /*
920          * Send directly if SQ processing is not in progress.
921          * Possible immediate errors (rv < 0) do not affect the involved
922          * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
923          * processing, if new work is already pending. But rv must be
924          * passed back to the caller.
925          */
926         if (wqe->wr_status != SIW_WR_IDLE) {
927                 spin_unlock_irqrestore(&qp->sq_lock, flags);
928                 goto skip_direct_sending;
929         }
930         rv = siw_activate_tx(qp);
931         spin_unlock_irqrestore(&qp->sq_lock, flags);
932
933         if (rv <= 0)
934                 goto skip_direct_sending;
935
936         if (rdma_is_kernel_res(&qp->base_qp.res)) {
937                 rv = siw_sq_start(qp);
938         } else {
939                 qp->tx_ctx.in_syscall = 1;
940
941                 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
942                         siw_qp_cm_drop(qp, 0);
943
944                 qp->tx_ctx.in_syscall = 0;
945         }
946 skip_direct_sending:
947
948         up_read(&qp->state_lock);
949
950         if (rv >= 0)
951                 return 0;
952         /*
953          * Immediate error
954          */
955         siw_dbg_qp(qp, "error %d\n", rv);
956
957         *bad_wr = wr;
958         return rv;
959 }
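
/*
 * A minimal sketch of posting an RDMA WRITE through siw_post_send()
 * from a kernel client ('qp', 'laddr'/'lkey' of a registered local
 * buffer and 'raddr'/'rkey' advertised by the peer are placeholders):
 *
 *        struct ib_sge sge = {
 *                .addr = laddr, .length = len, .lkey = lkey,
 *        };
 *        struct ib_rdma_wr wr = {
 *                .wr = {
 *                        .wr_id      = my_cookie,
 *                        .sg_list    = &sge,
 *                        .num_sge    = 1,
 *                        .opcode     = IB_WR_RDMA_WRITE,
 *                        .send_flags = IB_SEND_SIGNALED,
 *                },
 *                .remote_addr = raddr,
 *                .rkey        = rkey,
 *        };
 *        const struct ib_send_wr *bad_wr;
 *        int rc = ib_post_send(qp, &wr.wr, &bad_wr);
 *
 * With IB_SEND_SIGNALED set (or SIW_SIGNAL_ALL_WR active on the QP),
 * a work completion is reported on the send CQ once siw has processed
 * the SQE.
 */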
960
961 /*
962  * siw_post_receive()
963  *
964  * Post a list of R-WR's to a RQ.
965  *
966  * @base_qp:    Base QP contained in siw QP
967  * @wr:         Null terminated list of user WR's
968  * @bad_wr:     Points to failing WR in case of synchronous failure.
969  */
970 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
971                      const struct ib_recv_wr **bad_wr)
972 {
973         struct siw_qp *qp = to_siw_qp(base_qp);
974         unsigned long flags;
975         int rv = 0;
976
977         if (qp->srq || qp->attrs.rq_size == 0) {
978                 *bad_wr = wr;
979                 return -EINVAL;
980         }
981         if (!rdma_is_kernel_res(&qp->base_qp.res)) {
982                 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
983                 *bad_wr = wr;
984                 return -EINVAL;
985         }
986
987         /*
988          * Try to acquire QP state lock. Must be non-blocking
989          * to accommodate kernel clients' needs.
990          */
991         if (!down_read_trylock(&qp->state_lock)) {
992                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
993                         /*
994                          * ERROR state is final, so we can be sure
995                          * this state will not change as long as the QP
996                          * exists.
997                          *
998                          * This handles an ib_drain_rq() call with
999                          * a concurrent request to set the QP state
1000                          * to ERROR.
1001                          */
1002                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1003                 } else {
1004                         siw_dbg_qp(qp, "QP locked, state %d\n",
1005                                    qp->attrs.state);
1006                         *bad_wr = wr;
1007                         rv = -ENOTCONN;
1008                 }
1009                 return rv;
1010         }
1011         if (qp->attrs.state > SIW_QP_STATE_RTS) {
1012                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1013                         /*
1014                          * Immediately flush this WR to CQ, if QP
1015                          * is in ERROR state. RQ is guaranteed to
1016                          * be empty, so WR completes in-order.
1017                          *
1018                          * Typically triggered by ib_drain_rq().
1019                          */
1020                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1021                 } else {
1022                         siw_dbg_qp(qp, "QP out of state %d\n",
1023                                    qp->attrs.state);
1024                         *bad_wr = wr;
1025                         rv = -ENOTCONN;
1026                 }
1027                 up_read(&qp->state_lock);
1028                 return rv;
1029         }
1030         /*
1031          * Serialize potentially multiple producers.
1032          * Not needed for single threaded consumer side.
1033          */
1034         spin_lock_irqsave(&qp->rq_lock, flags);
1035
1036         while (wr) {
1037                 u32 idx = qp->rq_put % qp->attrs.rq_size;
1038                 struct siw_rqe *rqe = &qp->recvq[idx];
1039
1040                 if (rqe->flags) {
1041                         siw_dbg_qp(qp, "RQ full\n");
1042                         rv = -ENOMEM;
1043                         break;
1044                 }
1045                 if (wr->num_sge > qp->attrs.rq_max_sges) {
1046                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1047                         rv = -EINVAL;
1048                         break;
1049                 }
1050                 rqe->id = wr->wr_id;
1051                 rqe->num_sge = wr->num_sge;
1052                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1053
1054                 /* make sure RQE is completely written before valid */
1055                 smp_wmb();
1056
1057                 rqe->flags = SIW_WQE_VALID;
1058
1059                 qp->rq_put++;
1060                 wr = wr->next;
1061         }
1062         spin_unlock_irqrestore(&qp->rq_lock, flags);
1063
1064         up_read(&qp->state_lock);
1065
1066         if (rv < 0) {
1067                 siw_dbg_qp(qp, "error %d\n", rv);
1068                 *bad_wr = wr;
1069         }
1070         return rv > 0 ? 0 : rv;
1071 }
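
/*
 * A minimal sketch of posting a receive buffer through siw_post_receive()
 * from a kernel client ('qp', 'buf', 'buf_len' and 'lkey' of a local MR
 * valid under the QP's PD, e.g. from siw_get_dma_mr(), are placeholders):
 *
 *        struct ib_sge sge = {
 *                .addr = (uintptr_t)buf, .length = buf_len, .lkey = lkey,
 *        };
 *        struct ib_recv_wr wr = {
 *                .wr_id = my_cookie, .sg_list = &sge, .num_sge = 1,
 *        };
 *        const struct ib_recv_wr *bad_wr;
 *        int rc = ib_post_recv(qp, &wr, &bad_wr);
 *
 * This path only serves kernel QPs without an SRQ; user mapped RQs are
 * filled directly from user space, and SRQs go through siw_post_srq_recv().
 */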
1072
1073 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1074 {
1075         struct siw_cq *cq = to_siw_cq(base_cq);
1076         struct siw_device *sdev = to_siw_dev(base_cq->device);
1077         struct siw_ucontext *ctx =
1078                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1079                                           base_ucontext);
1080
1081         siw_dbg_cq(cq, "free CQ resources\n");
1082
1083         siw_cq_flush(cq);
1084
1085         if (ctx)
1086                 rdma_user_mmap_entry_remove(cq->cq_entry);
1087
1088         atomic_dec(&sdev->num_cq);
1089
1090         vfree(cq->queue);
1091         return 0;
1092 }
1093
1094 /*
1095  * siw_create_cq()
1096  *
1097  * Populate CQ of requested size
1098  *
1099  * @base_cq: CQ as allocated by RDMA midlayer
1100  * @attr: Initial CQ attributes
1101  * @udata: relates to user context
1102  */
1103
1104 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1105                   struct ib_udata *udata)
1106 {
1107         struct siw_device *sdev = to_siw_dev(base_cq->device);
1108         struct siw_cq *cq = to_siw_cq(base_cq);
1109         int rv, size = attr->cqe;
1110
1111         if (attr->flags)
1112                 return -EOPNOTSUPP;
1113
1114         if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1115                 siw_dbg(base_cq->device, "too many CQ's\n");
1116                 rv = -ENOMEM;
1117                 goto err_out;
1118         }
1119         if (size < 1 || size > sdev->attrs.max_cqe) {
1120                 siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1121                 rv = -EINVAL;
1122                 goto err_out;
1123         }
1124         size = roundup_pow_of_two(size);
1125         cq->base_cq.cqe = size;
1126         cq->num_cqe = size;
1127
1128         if (udata)
1129                 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1130                                          sizeof(struct siw_cq_ctrl));
1131         else
1132                 cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1133                                     sizeof(struct siw_cq_ctrl));
1134
1135         if (cq->queue == NULL) {
1136                 rv = -ENOMEM;
1137                 goto err_out;
1138         }
1139         get_random_bytes(&cq->id, 4);
1140         siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1141
1142         spin_lock_init(&cq->lock);
1143
1144         cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1145
1146         if (udata) {
1147                 struct siw_uresp_create_cq uresp = {};
1148                 struct siw_ucontext *ctx =
1149                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1150                                                   base_ucontext);
1151                 size_t length = size * sizeof(struct siw_cqe) +
1152                         sizeof(struct siw_cq_ctrl);
1153
1154                 cq->cq_entry =
1155                         siw_mmap_entry_insert(ctx, cq->queue,
1156                                               length, &uresp.cq_key);
1157                 if (!cq->cq_entry) {
1158                         rv = -ENOMEM;
1159                         goto err_out;
1160                 }
1161
1162                 uresp.cq_id = cq->id;
1163                 uresp.num_cqe = size;
1164
1165                 if (udata->outlen < sizeof(uresp)) {
1166                         rv = -EINVAL;
1167                         goto err_out;
1168                 }
1169                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1170                 if (rv)
1171                         goto err_out;
1172         }
1173         return 0;
1174
1175 err_out:
1176         siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1177
1178         if (cq && cq->queue) {
1179                 struct siw_ucontext *ctx =
1180                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1181                                                   base_ucontext);
1182                 if (ctx)
1183                         rdma_user_mmap_entry_remove(cq->cq_entry);
1184                 vfree(cq->queue);
1185         }
1186         atomic_dec(&sdev->num_cq);
1187
1188         return rv;
1189 }
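
/*
 * A minimal sketch of a kernel ULP allocating a CQ backed by
 * siw_create_cq() above ('ibdev' is a placeholder for the siw
 * ib_device; error handling omitted):
 *
 *        struct ib_cq *cq = ib_alloc_cq(ibdev, NULL, 256, 0,
 *                                       IB_POLL_WORKQUEUE);
 *
 * The requested size is rounded up to a power of two, and the CQE
 * array plus its trailing siw_cq_ctrl block is vzalloc()ed here
 * (vmalloc_user()ed for user contexts, so that it can be mmap()ed).
 */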
1190
1191 /*
1192  * siw_poll_cq()
1193  *
1194  * Reap CQ entries if available and copy work completion status into
1195  * array of WC's provided by caller. Returns number of reaped CQE's.
1196  *
1197  * @base_cq:    Base CQ contained in siw CQ.
1198  * @num_cqe:    Maximum number of CQE's to reap.
1199  * @wc:         Array of work completions to be filled by siw.
1200  */
1201 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1202 {
1203         struct siw_cq *cq = to_siw_cq(base_cq);
1204         int i;
1205
1206         for (i = 0; i < num_cqe; i++) {
1207                 if (!siw_reap_cqe(cq, wc))
1208                         break;
1209                 wc++;
1210         }
1211         return i;
1212 }
1213
1214 /*
1215  * siw_req_notify_cq()
1216  *
1217  * Request notification for new CQE's added to that CQ.
1218  * Defined flags:
1219  * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1220  *   event if a WQE with notification flag set enters the CQ
1221  * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1222  *   event if a WQE enters the CQ.
1223  * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
1224  *   number of not yet reaped CQE's regardless of their notification
1225  *   type and the current or new CQ notification settings.
1226  *
1227  * @base_cq:    Base CQ contained in siw CQ.
1228  * @flags:      Requested notification flags.
1229  */
1230 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1231 {
1232         struct siw_cq *cq = to_siw_cq(base_cq);
1233
1234         siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1235
1236         if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1237                 /*
1238                  * Enable CQ event for next solicited completion.
1239                  * and make it visible to all associated producers.
1240                  */
1241                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1242         else
1243                 /*
1244                  * Enable CQ event for any signalled completion.
1245                  * and make it visible to all associated producers.
1246                  */
1247                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1248
1249         if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1250                 return cq->cq_put - cq->cq_get;
1251
1252         return 0;
1253 }
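
/*
 * A minimal sketch of the usual poll/arm/re-poll pattern built on
 * siw_poll_cq() and siw_req_notify_cq() ('cq', 'wc' and handle_wc()
 * are placeholders):
 *
 *        while (ib_poll_cq(cq, 1, &wc) > 0)
 *                handle_wc(&wc);
 *        if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
 *                             IB_CQ_REPORT_MISSED_EVENTS) > 0) {
 *                while (ib_poll_cq(cq, 1, &wc) > 0)
 *                        handle_wc(&wc);
 *        }
 *
 * A positive return with IB_CQ_REPORT_MISSED_EVENTS is the
 * cq_put - cq_get difference computed above, so the caller can drain
 * CQEs that slipped in around the arming of the CQ.
 */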
1254
1255 /*
1256  * siw_dereg_mr()
1257  *
1258  * Release Memory Region.
1259  *
1260  * @base_mr: Base MR contained in siw MR.
1261  * @udata: points to user context, unused.
1262  */
1263 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1264 {
1265         struct siw_mr *mr = to_siw_mr(base_mr);
1266         struct siw_device *sdev = to_siw_dev(base_mr->device);
1267
1268         siw_dbg_mem(mr->mem, "deregister MR\n");
1269
1270         atomic_dec(&sdev->num_mr);
1271
1272         siw_mr_drop_mem(mr);
1273         kfree_rcu(mr, rcu);
1274
1275         return 0;
1276 }
1277
1278 /*
1279  * siw_reg_user_mr()
1280  *
1281  * Register Memory Region.
1282  *
1283  * @pd:         Protection Domain
1284  * @start:      starting address of MR (virtual address)
1285  * @len:        length of MR
1286  * @rnic_va:    not used by siw
1287  * @rights:     MR access rights
1288  * @udata:      user buffer to communicate STag and Key.
1289  */
1290 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1291                               u64 rnic_va, int rights, struct ib_udata *udata)
1292 {
1293         struct siw_mr *mr = NULL;
1294         struct siw_umem *umem = NULL;
1295         struct siw_ureq_reg_mr ureq;
1296         struct siw_device *sdev = to_siw_dev(pd->device);
1297
1298         unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
1299         int rv;
1300
1301         siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1302                    (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1303                    (unsigned long long)len);
1304
1305         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1306                 siw_dbg_pd(pd, "too many mr's\n");
1307                 rv = -ENOMEM;
1308                 goto err_out;
1309         }
1310         if (!len) {
1311                 rv = -EINVAL;
1312                 goto err_out;
1313         }
1314         if (mem_limit != RLIM_INFINITY) {
1315                 unsigned long num_pages =
1316                         (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
1317                 mem_limit >>= PAGE_SHIFT;
1318
1319                 if (num_pages > mem_limit - current->mm->locked_vm) {
1320                         siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
1321                                    num_pages, mem_limit,
1322                                    current->mm->locked_vm);
1323                         rv = -ENOMEM;
1324                         goto err_out;
1325                 }
1326         }
1327         umem = siw_umem_get(start, len, ib_access_writable(rights));
1328         if (IS_ERR(umem)) {
1329                 rv = PTR_ERR(umem);
1330                 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1331                 umem = NULL;
1332                 goto err_out;
1333         }
1334         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1335         if (!mr) {
1336                 rv = -ENOMEM;
1337                 goto err_out;
1338         }
1339         rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1340         if (rv)
1341                 goto err_out;
1342
1343         if (udata) {
1344                 struct siw_uresp_reg_mr uresp = {};
1345                 struct siw_mem *mem = mr->mem;
1346
1347                 if (udata->inlen < sizeof(ureq)) {
1348                         rv = -EINVAL;
1349                         goto err_out;
1350                 }
1351                 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1352                 if (rv)
1353                         goto err_out;
1354
1355                 mr->base_mr.lkey |= ureq.stag_key;
1356                 mr->base_mr.rkey |= ureq.stag_key;
1357                 mem->stag |= ureq.stag_key;
1358                 uresp.stag = mem->stag;
1359
1360                 if (udata->outlen < sizeof(uresp)) {
1361                         rv = -EINVAL;
1362                         goto err_out;
1363                 }
1364                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1365                 if (rv)
1366                         goto err_out;
1367         }
1368         mr->mem->stag_valid = 1;
1369
1370         return &mr->base_mr;
1371
1372 err_out:
1373         atomic_dec(&sdev->num_mr);
1374         if (mr) {
1375                 if (mr->mem)
1376                         siw_mr_drop_mem(mr);
1377                 kfree_rcu(mr, rcu);
1378         } else {
1379                 if (umem)
1380                         siw_umem_release(umem, false);
1381         }
1382         return ERR_PTR(rv);
1383 }
1384
1385 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1386                            u32 max_sge)
1387 {
1388         struct siw_device *sdev = to_siw_dev(pd->device);
1389         struct siw_mr *mr = NULL;
1390         struct siw_pbl *pbl = NULL;
1391         int rv;
1392
1393         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1394                 siw_dbg_pd(pd, "too many mr's\n");
1395                 rv = -ENOMEM;
1396                 goto err_out;
1397         }
1398         if (mr_type != IB_MR_TYPE_MEM_REG) {
1399                 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1400                 rv = -EOPNOTSUPP;
1401                 goto err_out;
1402         }
1403         if (max_sge > SIW_MAX_SGE_PBL) {
1404                 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1405                 rv = -ENOMEM;
1406                 goto err_out;
1407         }
1408         pbl = siw_pbl_alloc(max_sge);
1409         if (IS_ERR(pbl)) {
1410                 rv = PTR_ERR(pbl);
1411                 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1412                 pbl = NULL;
1413                 goto err_out;
1414         }
1415         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1416         if (!mr) {
1417                 rv = -ENOMEM;
1418                 goto err_out;
1419         }
1420         rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1421         if (rv)
1422                 goto err_out;
1423
1424         mr->mem->is_pbl = 1;
1425
1426         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1427
1428         return &mr->base_mr;
1429
1430 err_out:
1431         atomic_dec(&sdev->num_mr);
1432
1433         if (!mr) {
1434                 kfree(pbl);
1435         } else {
1436                 if (mr->mem)
1437                         siw_mr_drop_mem(mr);
1438                 kfree_rcu(mr, rcu);
1439         }
1440         siw_dbg_pd(pd, "failed: %d\n", rv);
1441
1442         return ERR_PTR(rv);
1443 }
1444
1445 /* Just used to count number of pages being mapped */
1446 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1447 {
1448         return 0;
1449 }
1450
1451 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1452                   unsigned int *sg_off)
1453 {
1454         struct scatterlist *slp;
1455         struct siw_mr *mr = to_siw_mr(base_mr);
1456         struct siw_mem *mem = mr->mem;
1457         struct siw_pbl *pbl = mem->pbl;
1458         struct siw_pble *pble;
1459         unsigned long pbl_size;
1460         int i, rv;
1461
1462         if (!pbl) {
1463                 siw_dbg_mem(mem, "no PBL allocated\n");
1464                 return -EINVAL;
1465         }
1466         pble = pbl->pbe;
1467
1468         if (pbl->max_buf < num_sle) {
1469                 siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
1470                             num_sle, mem->pbl->max_buf);
1471                 return -ENOMEM;
1472         }
1473         for_each_sg(sl, slp, num_sle, i) {
1474                 if (sg_dma_len(slp) == 0) {
1475                         siw_dbg_mem(mem, "empty SGE\n");
1476                         return -EINVAL;
1477                 }
1478                 if (i == 0) {
1479                         pble->addr = sg_dma_address(slp);
1480                         pble->size = sg_dma_len(slp);
1481                         pble->pbl_off = 0;
1482                         pbl_size = pble->size;
1483                         pbl->num_buf = 1;
1484                 } else {
1485                         /* Merge PBL entries if adjacent */
1486                         if (pble->addr + pble->size == sg_dma_address(slp)) {
1487                                 pble->size += sg_dma_len(slp);
1488                         } else {
1489                                 pble++;
1490                                 pbl->num_buf++;
1491                                 pble->addr = sg_dma_address(slp);
1492                                 pble->size = sg_dma_len(slp);
1493                                 pble->pbl_off = pbl_size;
1494                         }
1495                         pbl_size += sg_dma_len(slp);
1496                 }
1497                 siw_dbg_mem(mem,
1498                         "sge[%d], size %u, addr 0x%p, total %lu\n",
1499                         i, pble->size, (void *)(uintptr_t)pble->addr,
1500                         pbl_size);
1501         }
1502         rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1503         if (rv > 0) {
1504                 mem->len = base_mr->length;
1505                 mem->va = base_mr->iova;
1506                 siw_dbg_mem(mem,
1507                         "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1508                         mem->len, (void *)(uintptr_t)mem->va, num_sle,
1509                         pbl->num_buf);
1510         }
1511         return rv;
1512 }
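
/*
 * A minimal sketch of the fast-registration flow served by siw_alloc_mr(),
 * siw_map_mr_sg() and the IB_WR_REG_MR case in siw_post_send() ('pd',
 * 'qp' and a DMA-mapped scatterlist 'sgl'/'nents' are placeholders;
 * error handling omitted):
 *
 *        struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
 *        int n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *
 *        struct ib_reg_wr reg = {
 *                .wr     = { .opcode = IB_WR_REG_MR,
 *                            .send_flags = IB_SEND_SIGNALED },
 *                .mr     = mr,
 *                .key    = mr->rkey,
 *                .access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE,
 *        };
 *        const struct ib_send_wr *bad_wr;
 *        int rc = ib_post_send(qp, &reg.wr, &bad_wr);
 *
 * siw_map_mr_sg() merges physically adjacent scatterlist entries into a
 * single PBL entry, so the PBL may end up with fewer entries than the
 * scatterlist has elements.
 */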
1513
1514 /*
1515  * siw_get_dma_mr()
1516  *
1517  * Create an (empty) DMA memory region with no umem attached.
1518  */
1519 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1520 {
1521         struct siw_device *sdev = to_siw_dev(pd->device);
1522         struct siw_mr *mr = NULL;
1523         int rv;
1524
1525         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1526                 siw_dbg_pd(pd, "too many mr's\n");
1527                 rv = -ENOMEM;
1528                 goto err_out;
1529         }
1530         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1531         if (!mr) {
1532                 rv = -ENOMEM;
1533                 goto err_out;
1534         }
1535         rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1536         if (rv)
1537                 goto err_out;
1538
1539         mr->mem->stag_valid = 1;
1540
1541         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1542
1543         return &mr->base_mr;
1544
1545 err_out:
1546         if (rv)
1547                 kfree(mr);
1548
1549         atomic_dec(&sdev->num_mr);
1550
1551         return ERR_PTR(rv);
1552 }
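/*
 * Illustrative sketch (not compiled): kernel consumers do not call
 * .get_dma_mr() directly. They allocate a PD and use pd->local_dma_lkey
 * in their SGEs; for a device that does not advertise a device-wide DMA
 * lkey, the RDMA core is expected to obtain one via this callback during
 * PD allocation. demo_dma_lkey() is hypothetical.
 */
#if 0
static u32 demo_dma_lkey(struct ib_device *dev, struct ib_pd **pd_out)
{
        struct ib_pd *pd = ib_alloc_pd(dev, 0);

        if (IS_ERR(pd))
                return 0;       /* sketch only: 0 means "no PD/lkey" here */

        *pd_out = pd;
        /* usable as ib_sge.lkey for DMA-mapped kernel buffers */
        return pd->local_dma_lkey;
}
#endif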
1553
1554 /*
1555  * siw_create_srq()
1556  *
1557  * Create a Shared Receive Queue with attributes @init_attrs
1558  * within the protection domain given by @pd.
1559  *
1560  * @base_srq:   Base SRQ contained in siw SRQ.
1561  * @init_attrs: SRQ init attributes.
1562  * @udata:      points to user context
1563  */
1564 int siw_create_srq(struct ib_srq *base_srq,
1565                    struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1566 {
1567         struct siw_srq *srq = to_siw_srq(base_srq);
1568         struct ib_srq_attr *attrs = &init_attrs->attr;
1569         struct siw_device *sdev = to_siw_dev(base_srq->device);
1570         struct siw_ucontext *ctx =
1571                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1572                                           base_ucontext);
1573         int rv;
1574
1575         if (init_attrs->srq_type != IB_SRQT_BASIC)
1576                 return -EOPNOTSUPP;
1577
1578         if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1579                 siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1580                 rv = -ENOMEM;
1581                 goto err_out;
1582         }
1583         if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1584             attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1585                 rv = -EINVAL;
1586                 goto err_out;
1587         }
1588         srq->max_sge = attrs->max_sge;
1589         srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1590         srq->limit = attrs->srq_limit;
1591         if (srq->limit)
1592                 srq->armed = true;
1593
1594         srq->is_kernel_res = !udata;
1595
1596         if (udata)
1597                 srq->recvq =
1598                         vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1599         else
1600                 srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
1601
1602         if (srq->recvq == NULL) {
1603                 rv = -ENOMEM;
1604                 goto err_out;
1605         }
1606         if (udata) {
1607                 struct siw_uresp_create_srq uresp = {};
1608                 size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1609
1610                 srq->srq_entry =
1611                         siw_mmap_entry_insert(ctx, srq->recvq,
1612                                               length, &uresp.srq_key);
1613                 if (!srq->srq_entry) {
1614                         rv = -ENOMEM;
1615                         goto err_out;
1616                 }
1617
1618                 uresp.num_rqe = srq->num_rqe;
1619
1620                 if (udata->outlen < sizeof(uresp)) {
1621                         rv = -EINVAL;
1622                         goto err_out;
1623                 }
1624                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1625                 if (rv)
1626                         goto err_out;
1627         }
1628         spin_lock_init(&srq->lock);
1629
1630         siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1631
1632         return 0;
1633
1634 err_out:
1635         if (srq->recvq) {
1636                 if (ctx)
1637                         rdma_user_mmap_entry_remove(srq->srq_entry);
1638                 vfree(srq->recvq);
1639         }
1640         atomic_dec(&sdev->num_srq);
1641
1642         return rv;
1643 }
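/*
 * Illustrative sketch (not compiled): how a kernel consumer might reach
 * siw_create_srq() via ib_create_srq(). The queue sizes and the
 * demo_create_srq() name are hypothetical, not values mandated by siw.
 */
#if 0
static struct ib_srq *demo_create_srq(struct ib_pd *pd)
{
        struct ib_srq_init_attr init = {
                .srq_type = IB_SRQT_BASIC,      /* the only type siw accepts */
                .attr = {
                        .max_wr    = 128,       /* must not exceed SIW_MAX_SRQ_WR */
                        .max_sge   = 4,         /* must not exceed SIW_MAX_SGE */
                        .srq_limit = 16,        /* non-zero arms the limit event */
                },
        };

        return ib_create_srq(pd, &init);        /* ERR_PTR() on failure */
}
#endif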
1644
1645 /*
1646  * siw_modify_srq()
1647  *
1648  * Modify SRQ. The caller may resize SRQ and/or set/reset notification
1649  * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
1650  *
1651  * NOTE: it is unclear whether the RDMA core allows changing the MAX_SGE
1652  * parameter; siw_modify_srq() does not check the attrs->max_sge param.
1653  */
1654 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1655                    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1656 {
1657         struct siw_srq *srq = to_siw_srq(base_srq);
1658         unsigned long flags;
1659         int rv = 0;
1660
1661         spin_lock_irqsave(&srq->lock, flags);
1662
1663         if (attr_mask & IB_SRQ_MAX_WR) {
1664                 /* resize request not yet supported */
1665                 rv = -EOPNOTSUPP;
1666                 goto out;
1667         }
1668         if (attr_mask & IB_SRQ_LIMIT) {
1669                 if (attrs->srq_limit) {
1670                         if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1671                                 rv = -EINVAL;
1672                                 goto out;
1673                         }
1674                         srq->armed = true;
1675                 } else {
1676                         srq->armed = false;
1677                 }
1678                 srq->limit = attrs->srq_limit;
1679         }
1680 out:
1681         spin_unlock_irqrestore(&srq->lock, flags);
1682
1683         return rv;
1684 }
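/*
 * Illustrative sketch (not compiled): re-arming the SRQ limit event from a
 * consumer. Setting IB_SRQ_MAX_WR would be rejected with -EOPNOTSUPP by
 * siw_modify_srq() above; only IB_SRQ_LIMIT is honoured here.
 * demo_rearm_srq() is hypothetical.
 */
#if 0
static int demo_rearm_srq(struct ib_srq *srq, u32 low_watermark)
{
        struct ib_srq_attr attr = { .srq_limit = low_watermark };

        /* arms a one-shot IB_EVENT_SRQ_LIMIT_REACHED notification */
        return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
}
#endif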
1685
1686 /*
1687  * siw_query_srq()
1688  *
1689  * Query SRQ attributes.
1690  */
1691 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1692 {
1693         struct siw_srq *srq = to_siw_srq(base_srq);
1694         unsigned long flags;
1695
1696         spin_lock_irqsave(&srq->lock, flags);
1697
1698         attrs->max_wr = srq->num_rqe;
1699         attrs->max_sge = srq->max_sge;
1700         attrs->srq_limit = srq->limit;
1701
1702         spin_unlock_irqrestore(&srq->lock, flags);
1703
1704         return 0;
1705 }
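/*
 * Illustrative sketch (not compiled): reading the current SRQ attributes
 * back through ib_query_srq(). demo_srq_limit() is hypothetical.
 */
#if 0
static u32 demo_srq_limit(struct ib_srq *srq)
{
        struct ib_srq_attr attr;

        if (ib_query_srq(srq, &attr))
                return 0;       /* sketch only: treat failure as "no limit" */

        return attr.srq_limit;
}
#endif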
1706
1707 /*
1708  * siw_destroy_srq()
1709  *
1710  * Destroy SRQ.
1711  * It is assumed that the SRQ is no longer referenced by any
1712  * QP; the code trusts the RDMA core environment to keep track
1713  * of QP references.
1714  */
1715 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1716 {
1717         struct siw_srq *srq = to_siw_srq(base_srq);
1718         struct siw_device *sdev = to_siw_dev(base_srq->device);
1719         struct siw_ucontext *ctx =
1720                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1721                                           base_ucontext);
1722
1723         if (ctx)
1724                 rdma_user_mmap_entry_remove(srq->srq_entry);
1725         vfree(srq->recvq);
1726         atomic_dec(&sdev->num_srq);
1727         return 0;
1728 }
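/*
 * Illustrative sketch (not compiled): the teardown ordering expected from
 * the consumer - destroy all QPs attached to the SRQ before destroying the
 * SRQ itself. demo_teardown_srq() is hypothetical.
 */
#if 0
static void demo_teardown_srq(struct ib_qp *qp, struct ib_srq *srq)
{
        ib_destroy_qp(qp);      /* drops the QP's reference to the SRQ */
        ib_destroy_srq(srq);    /* ends up in siw_destroy_srq() above */
}
#endif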
1729
1730 /*
1731  * siw_post_srq_recv()
1732  *
1733  * Post a list of receive queue elements to SRQ.
1734  * NOTE: The function does not check or lock the SRQ state
1735  *       during the post operation. The code simply trusts the
1736  *       RDMA core environment.
1737  *
1738  * @base_srq:   Base SRQ contained in siw SRQ
1739  * @wr:         List of R-WR's
1740  * @bad_wr:     Updated to failing WR if posting fails.
1741  */
1742 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1743                       const struct ib_recv_wr **bad_wr)
1744 {
1745         struct siw_srq *srq = to_siw_srq(base_srq);
1746         unsigned long flags;
1747         int rv = 0;
1748
1749         if (unlikely(!srq->is_kernel_res)) {
1750                 siw_dbg_pd(base_srq->pd,
1751                            "[SRQ]: no kernel post_recv for mapped srq\n");
1752                 rv = -EINVAL;
1753                 goto out;
1754         }
1755         /*
1756          * Serialize potentially multiple producers.
1757          * Also needed to serialize potentially multiple
1758          * consumers.
1759          */
1760         spin_lock_irqsave(&srq->lock, flags);
1761
1762         while (wr) {
1763                 u32 idx = srq->rq_put % srq->num_rqe;
1764                 struct siw_rqe *rqe = &srq->recvq[idx];
1765
1766                 if (rqe->flags) {
1767                         siw_dbg_pd(base_srq->pd, "SRQ full\n");
1768                         rv = -ENOMEM;
1769                         break;
1770                 }
1771                 if (unlikely(wr->num_sge > srq->max_sge)) {
1772                         siw_dbg_pd(base_srq->pd,
1773                                    "[SRQ]: too many sge's: %d\n", wr->num_sge);
1774                         rv = -EINVAL;
1775                         break;
1776                 }
1777                 rqe->id = wr->wr_id;
1778                 rqe->num_sge = wr->num_sge;
1779                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1780
1781                 /* Make sure S-RQE is completely written before valid */
1782                 smp_wmb();
1783
1784                 rqe->flags = SIW_WQE_VALID;
1785
1786                 srq->rq_put++;
1787                 wr = wr->next;
1788         }
1789         spin_unlock_irqrestore(&srq->lock, flags);
1790 out:
1791         if (unlikely(rv < 0)) {
1792                 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1793                 *bad_wr = wr;
1794         }
1795         return rv;
1796 }
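/*
 * Illustrative sketch (not compiled): a kernel producer posting a single
 * receive to the SRQ through ib_post_srq_recv(), which lands in
 * siw_post_srq_recv() above. The DMA address, length and lkey are assumed
 * to be provided by the caller; demo_post_one() is hypothetical.
 */
#if 0
static int demo_post_one(struct ib_srq *srq, u64 dma_addr, u32 len, u32 lkey)
{
        struct ib_sge sge = {
                .addr   = dma_addr,
                .length = len,
                .lkey   = lkey,
        };
        struct ib_recv_wr wr = {
                .wr_id   = dma_addr,    /* echoed back in the completion */
                .sg_list = &sge,
                .num_sge = 1,
        };
        const struct ib_recv_wr *bad_wr;

        return ib_post_srq_recv(srq, &wr, &bad_wr);
}
#endif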
1797
1798 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1799 {
1800         struct ib_event event;
1801         struct ib_qp *base_qp = &qp->base_qp;
1802
1803         /*
1804          * Do not report asynchronous errors on QP which gets
1805          * destroyed via verbs interface (siw_destroy_qp())
1806          */
1807         if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1808                 return;
1809
1810         event.event = etype;
1811         event.device = base_qp->device;
1812         event.element.qp = base_qp;
1813
1814         if (base_qp->event_handler) {
1815                 siw_dbg_qp(qp, "reporting event %d\n", etype);
1816                 base_qp->event_handler(&event, base_qp->qp_context);
1817         }
1818 }
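/*
 * Illustrative sketch (not compiled): the event_handler invoked above is
 * installed by the QP owner at creation time via ib_qp_init_attr; CQ and
 * SRQ handlers are installed analogously at CQ/SRQ creation. The queue
 * sizes and demo_* names are hypothetical; CQ setup is assumed to have
 * happened elsewhere.
 */
#if 0
static void demo_qp_event(struct ib_event *ev, void *qp_context)
{
        pr_info("QP async event %d on %s\n", ev->event,
                dev_name(&ev->device->dev));
}

static struct ib_qp *demo_create_qp(struct ib_pd *pd, struct ib_cq *cq)
{
        struct ib_qp_init_attr init = {
                .event_handler = demo_qp_event,
                .qp_context    = NULL,
                .send_cq       = cq,
                .recv_cq       = cq,
                .cap = {
                        .max_send_wr  = 32,
                        .max_recv_wr  = 32,
                        .max_send_sge = 1,
                        .max_recv_sge = 1,
                },
                .sq_sig_type   = IB_SIGNAL_REQ_WR,
                .qp_type       = IB_QPT_RC,
        };

        return ib_create_qp(pd, &init); /* ERR_PTR() on failure */
}
#endif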
1819
1820 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1821 {
1822         struct ib_event event;
1823         struct ib_cq *base_cq = &cq->base_cq;
1824
1825         event.event = etype;
1826         event.device = base_cq->device;
1827         event.element.cq = base_cq;
1828
1829         if (base_cq->event_handler) {
1830                 siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1831                 base_cq->event_handler(&event, base_cq->cq_context);
1832         }
1833 }
1834
1835 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1836 {
1837         struct ib_event event;
1838         struct ib_srq *base_srq = &srq->base_srq;
1839
1840         event.event = etype;
1841         event.device = base_srq->device;
1842         event.element.srq = base_srq;
1843
1844         if (base_srq->event_handler) {
1845                 siw_dbg_pd(srq->base_srq.pd,
1846                            "reporting SRQ event %d\n", etype);
1847                 base_srq->event_handler(&event, base_srq->srq_context);
1848         }
1849 }
1850
1851 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1852 {
1853         struct ib_event event;
1854
1855         event.event = etype;
1856         event.device = &sdev->base_dev;
1857         event.element.port_num = port;
1858
1859         siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1860
1861         ib_dispatch_event(&event);
1862 }
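/*
 * Illustrative sketch (not compiled): port events dispatched through
 * ib_dispatch_event() reach kernel clients that registered a device-wide
 * handler, unlike the per-object QP/CQ/SRQ handlers above. The demo_*
 * names are hypothetical.
 */
#if 0
static struct ib_event_handler demo_handler;

static void demo_port_event(struct ib_event_handler *h, struct ib_event *ev)
{
        if (ev->event == IB_EVENT_PORT_ACTIVE)
                pr_info("%s: port %u active\n",
                        dev_name(&ev->device->dev), ev->element.port_num);
}

static void demo_watch_ports(struct ib_device *dev)
{
        INIT_IB_EVENT_HANDLER(&demo_handler, dev, demo_port_event);
        ib_register_event_handler(&demo_handler);
        /* paired with ib_unregister_event_handler() on teardown */
}
#endif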