net/mlx5: DR, Warn and ignore SW steering rule insertion on QP err
[linux-2.6-microblaze.git] / drivers / net / ethernet / mellanox / mlx5 / core / steering / dr_send.c
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2019 Mellanox Technologies. */
3
4 #include <linux/smp.h>
5 #include "dr_types.h"
6
7 #define QUEUE_SIZE 128
8 #define SIGNAL_PER_DIV_QUEUE 16
9 #define TH_NUMS_TO_DRAIN 2
10
/* Poll results: CQ_OK - one CQE consumed, CQ_EMPTY - nothing to poll,
 * CQ_POLL_ERR - completion with error (QP is broken).
 */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
12
/* One data segment of an RDMA work request */
struct dr_data_seg {
	u64 addr;		/* DMA address of the local buffer */
	u32 length;		/* length in bytes */
	u32 lkey;		/* local memory key */
	unsigned int send_flags; /* e.g. IB_SEND_SIGNALED */
};
19
/* Parameters of one WRITE+READ pair posted towards ICM memory */
struct postsend_info {
	struct dr_data_seg write;	/* payload to write to ICM */
	struct dr_data_seg read;	/* read-back of the same area (see dr_fill_data_segs) */
	u64 remote_addr;		/* target ICM address */
	u32 rkey;			/* remote memory key */
};
26
/* Attributes for the INIT -> RTR QP state transition */
struct dr_qp_rtr_attr {
	struct mlx5dr_cmd_gid_attr dgid_attr;	/* queried GID/MAC; unused when fl is set */
	enum ib_mtu mtu;
	u32 qp_num;		/* remote QPN (the QP loops back to itself) */
	u16 port_num;
	u8 min_rnr_timer;	/* NOTE(review): not applied by dr_cmd_modify_qp_init2rtr */
	u8 sgid_index;
	u16 udp_src_port;	/* RoCE v2 UDP source port */
	u8 fl:1;		/* force-loopback: skip GID resolution */
};
37
/* Attributes for the RTR -> RTS QP state transition */
struct dr_qp_rts_attr {
	u8 timeout;	/* NOTE(review): ack_timeout is hardcoded in dr_cmd_modify_qp_rtr2rts */
	u8 retry_cnt;
	u8 rnr_retry;
};
43
/* Parameters needed to create the SW steering RC QP */
struct dr_qp_init_attr {
	u32 cqn;		/* CQ used for both send and receive completions */
	u32 pdn;		/* protection domain */
	u32 max_send_wr;	/* SQ depth, rounded up to a power of two */
	struct mlx5_uars_page *uar;
	u8 isolate_vl_tc:1;
};
51
52 static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64)
53 {
54         unsigned int idx;
55         u8 opcode;
56
57         opcode = get_cqe_opcode(cqe64);
58         if (opcode == MLX5_CQE_REQ_ERR) {
59                 idx = be16_to_cpu(cqe64->wqe_counter) &
60                         (dr_cq->qp->sq.wqe_cnt - 1);
61                 dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
62         } else if (opcode == MLX5_CQE_RESP_ERR) {
63                 ++dr_cq->qp->sq.cc;
64         } else {
65                 idx = be16_to_cpu(cqe64->wqe_counter) &
66                         (dr_cq->qp->sq.wqe_cnt - 1);
67                 dr_cq->qp->sq.cc = dr_cq->qp->sq.wqe_head[idx] + 1;
68
69                 return CQ_OK;
70         }
71
72         return CQ_POLL_ERR;
73 }
74
75 static int dr_cq_poll_one(struct mlx5dr_cq *dr_cq)
76 {
77         struct mlx5_cqe64 *cqe64;
78         int err;
79
80         cqe64 = mlx5_cqwq_get_cqe(&dr_cq->wq);
81         if (!cqe64)
82                 return CQ_EMPTY;
83
84         mlx5_cqwq_pop(&dr_cq->wq);
85         err = dr_parse_cqe(dr_cq, cqe64);
86         mlx5_cqwq_update_db_record(&dr_cq->wq);
87
88         return err;
89 }
90
91 static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne)
92 {
93         int npolled;
94         int err = 0;
95
96         for (npolled = 0; npolled < ne; ++npolled) {
97                 err = dr_cq_poll_one(dr_cq);
98                 if (err != CQ_OK)
99                         break;
100         }
101
102         return err == CQ_POLL_ERR ? err : npolled;
103 }
104
/* Create the RC QP that SW steering uses to write/read ICM memory.
 * Returns the new QP on success, NULL on any failure (callers only
 * check for NULL; the internal errno is not propagated).
 */
static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev,
					 struct dr_qp_init_attr *attr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {};
	struct mlx5_wq_param wqp;
	struct mlx5dr_qp *dr_qp;
	int inlen;
	void *qpc;
	void *in;
	int err;

	dr_qp = kzalloc(sizeof(*dr_qp), GFP_KERNEL);
	if (!dr_qp)
		return NULL;

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	/* RQ is kept at a minimal fixed size (only sends are posted);
	 * the SQ depth comes from the caller, rounded to a power of two
	 * so producer/consumer counters can be masked into ring indices.
	 */
	dr_qp->rq.pc = 0;
	dr_qp->rq.cc = 0;
	dr_qp->rq.wqe_cnt = 4;
	dr_qp->sq.pc = 0;
	dr_qp->sq.cc = 0;
	dr_qp->sq.wqe_cnt = roundup_pow_of_two(attr->max_send_wr);

	/* Temporary QPC used only to size and create the work queues */
	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	err = mlx5_wq_qp_create(mdev, &wqp, temp_qpc, &dr_qp->wq,
				&dr_qp->wq_ctrl);
	if (err) {
		mlx5_core_warn(mdev, "Can't create QP WQ\n");
		goto err_wq;
	}

	/* Per-slot producer-counter record, used by dr_parse_cqe() to
	 * advance the consumer counter when a CQE arrives.
	 */
	dr_qp->sq.wqe_head = kcalloc(dr_qp->sq.wqe_cnt,
				     sizeof(dr_qp->sq.wqe_head[0]),
				     GFP_KERNEL);

	if (!dr_qp->sq.wqe_head) {
		mlx5_core_warn(mdev, "Can't allocate wqe head\n");
		goto err_wqe_head;
	}

	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		dr_qp->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	/* Real QPC for the CREATE_QP command */
	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc);
	MLX5_SET(qpc, qpc, pd, attr->pdn);
	MLX5_SET(qpc, qpc, uar_page, attr->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 dr_qp->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, fre, 1);
	MLX5_SET(qpc, qpc, rlky, 1);
	/* Single CQ serves both directions */
	MLX5_SET(qpc, qpc, cqn_snd, attr->cqn);
	MLX5_SET(qpc, qpc, cqn_rcv, attr->cqn);
	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
	MLX5_SET(qpc, qpc, log_rq_size, ilog2(dr_qp->rq.wqe_cnt));
	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
	MLX5_SET(qpc, qpc, log_sq_size, ilog2(dr_qp->sq.wqe_cnt));
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	MLX5_SET64(qpc, qpc, dbr_addr, dr_qp->wq_ctrl.db.dma);
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	mlx5_fill_page_frag_array(&dr_qp->wq_ctrl.buf,
				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
							 in, pas));

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	kvfree(in);
	if (err)
		goto err_in;
	dr_qp->uar = attr->uar;

	return dr_qp;

err_in:
	kfree(dr_qp->sq.wqe_head);
err_wqe_head:
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
err_wq:
	kfree(dr_qp);
	return NULL;
}
201
/* Tear down a SW steering QP: destroy the HW object first, then free
 * the SW resources (wqe_head array, WQ buffers, the QP struct).
 */
static void dr_destroy_qp(struct mlx5_core_dev *mdev,
			  struct mlx5dr_qp *dr_qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, dr_qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	kfree(dr_qp->sq.wqe_head);
	mlx5_wq_destroy(&dr_qp->wq_ctrl);
	kfree(dr_qp);
}
215
/* Ring the SQ doorbell so HW starts processing the posted WQEs.
 * Ordering is critical: WQE contents must be globally visible before
 * the doorbell record update (dma_wmb), and the doorbell record must
 * be visible before the UAR/BlueFlame write (wmb).
 */
static void dr_cmd_notify_hw(struct mlx5dr_qp *dr_qp, void *ctrl)
{
	dma_wmb();
	*dr_qp->wq.sq.db = cpu_to_be32(dr_qp->sq.pc & 0xffff);

	/* After wmb() the hw aware of new work */
	wmb();

	mlx5_write64(ctrl, dr_qp->uar->map + MLX5_BF_OFFSET);
}
226
/* Build one RDMA WQE (ctrl + remote-address + data segment) in place in
 * the cyclic SQ buffer. The doorbell is rung only when @notify_hw is
 * set, letting callers batch a WRITE followed by a READ into one ring.
 */
static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr,
			     u32 rkey, struct dr_data_seg *data_seg,
			     u32 opcode, bool notify_hw)
{
	struct mlx5_wqe_raddr_seg *wq_raddr;
	struct mlx5_wqe_ctrl_seg *wq_ctrl;
	struct mlx5_wqe_data_seg *wq_dseg;
	unsigned int size;
	unsigned int idx;

	/* WQE size in 16-byte units, encoded into qpn_ds below */
	size = sizeof(*wq_ctrl) / 16 + sizeof(*wq_dseg) / 16 +
		sizeof(*wq_raddr) / 16;

	/* wqe_cnt is a power of two, so masking pc yields the ring slot */
	idx = dr_qp->sq.pc & (dr_qp->sq.wqe_cnt - 1);

	wq_ctrl = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx);
	wq_ctrl->imm = 0;
	/* Request a CQE only for signaled WQEs, limiting CQ traffic */
	wq_ctrl->fm_ce_se = (data_seg->send_flags) ?
		MLX5_WQE_CTRL_CQ_UPDATE : 0;
	wq_ctrl->opmod_idx_opcode = cpu_to_be32(((dr_qp->sq.pc & 0xffff) << 8) |
						opcode);
	wq_ctrl->qpn_ds = cpu_to_be32(size | dr_qp->qpn << 8);
	/* Remote address segment follows the ctrl segment */
	wq_raddr = (void *)(wq_ctrl + 1);
	wq_raddr->raddr = cpu_to_be64(remote_addr);
	wq_raddr->rkey = cpu_to_be32(rkey);
	wq_raddr->reserved = 0;

	/* Local data segment follows the remote address segment */
	wq_dseg = (void *)(wq_raddr + 1);
	wq_dseg->byte_count = cpu_to_be32(data_seg->length);
	wq_dseg->lkey = cpu_to_be32(data_seg->lkey);
	wq_dseg->addr = cpu_to_be64(data_seg->addr);

	/* Record the producer index occupying this slot for CQE handling */
	dr_qp->sq.wqe_head[idx] = dr_qp->sq.pc++;

	if (notify_hw)
		dr_cmd_notify_hw(dr_qp, wq_ctrl);
}
264
/* Post the WRITE and its read-back as one batch; only the trailing
 * READ rings the doorbell (the READ targets the same area that was
 * just written - see dr_fill_data_segs()).
 */
static void dr_post_send(struct mlx5dr_qp *dr_qp, struct postsend_info *send_info)
{
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->write, MLX5_OPCODE_RDMA_WRITE, false);
	dr_rdma_segments(dr_qp, send_info->remote_addr, send_info->rkey,
			 &send_info->read, MLX5_OPCODE_RDMA_READ, true);
}
272
273 /**
274  * mlx5dr_send_fill_and_append_ste_send_info: Add data to be sent
275  * with send_list parameters:
276  *
277  *     @ste:       The data that attached to this specific ste
278  *     @size:      of data to write
279  *     @offset:    of the data from start of the hw_ste entry
280  *     @data:      data
281  *     @ste_info:  ste to be sent with send_list
282  *     @send_list: to append into it
283  *     @copy_data: if true indicates that the data should be kept because
284  *                 it's not backuped any where (like in re-hash).
285  *                 if false, it lets the data to be updated after
286  *                 it was added to the list.
287  */
288 void mlx5dr_send_fill_and_append_ste_send_info(struct mlx5dr_ste *ste, u16 size,
289                                                u16 offset, u8 *data,
290                                                struct mlx5dr_ste_send_info *ste_info,
291                                                struct list_head *send_list,
292                                                bool copy_data)
293 {
294         ste_info->size = size;
295         ste_info->ste = ste;
296         ste_info->offset = offset;
297
298         if (copy_data) {
299                 memcpy(ste_info->data_cont, data, size);
300                 ste_info->data = ste_info->data_cont;
301         } else {
302                 ste_info->data = data;
303         }
304
305         list_add_tail(&ste_info->send_list, send_list);
306 }
307
/* The function tries to consume one wc each time, unless the queue is full, in
 * that case, which means that the hw is behind the sw in a full queue len
 * the function will drain the cq till it empty.
 */
static int dr_handle_pending_wc(struct mlx5dr_domain *dmn,
				struct mlx5dr_send_ring *send_ring)
{
	bool is_drain = false;
	int ne;

	/* Nothing has been signaled yet - nothing to reap */
	if (send_ring->pending_wqe < send_ring->signal_th)
		return 0;

	/* Queue is full start drain it */
	if (send_ring->pending_wqe >=
	    dmn->send_ring->signal_th * TH_NUMS_TO_DRAIN)
		is_drain = true;

	do {
		ne = dr_poll_cq(send_ring->cq, 1);
		if (unlikely(ne < 0)) {
			/* CQ error - mark the ring broken so subsequent
			 * posts are skipped (see dr_postsend_icm_data).
			 */
			mlx5_core_warn_once(dmn->mdev, "SMFS QPN 0x%x is disabled/limited",
					    send_ring->qp->qpn);
			send_ring->err_state = true;
			return ne;
		} else if (ne == 1) {
			/* One signaled CQE accounts for a whole batch of
			 * signal_th WQEs (only every signal_th-th WQE
			 * requests a completion).
			 */
			send_ring->pending_wqe -= send_ring->signal_th;
		}
	} while (is_drain && send_ring->pending_wqe);

	return 0;
}
340
/* Account the WRITE+READ WQE pair and decide which of them is signaled:
 * every signal_th-th pending WQE requests a completion.
 */
static void dr_fill_data_segs(struct mlx5dr_send_ring *send_ring,
			      struct postsend_info *send_info)
{
	/* First WQE of the pair: the RDMA WRITE */
	send_ring->pending_wqe++;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->write.send_flags |= IB_SEND_SIGNALED;

	/* Second WQE of the pair: the RDMA READ */
	send_ring->pending_wqe++;
	send_info->read.length = send_info->write.length;
	/* Read into the same write area */
	send_info->read.addr = (uintptr_t)send_info->write.addr;
	send_info->read.lkey = send_ring->mr->mkey.key;

	if (send_ring->pending_wqe % send_ring->signal_th == 0)
		send_info->read.send_flags = IB_SEND_SIGNALED;
	else
		send_info->read.send_flags = 0;
}
360
/* Post one WRITE+READ pair towards ICM under the send ring lock.
 * Returns 0 on success. Also returns 0 - skipping the post - when the
 * device or the ring is already in an error state, so SW steering
 * degrades gracefully instead of failing every rule insertion.
 */
static int dr_postsend_icm_data(struct mlx5dr_domain *dmn,
				struct postsend_info *send_info)
{
	struct mlx5dr_send_ring *send_ring = dmn->send_ring;
	u32 buff_offset;
	int ret;

	if (unlikely(dmn->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR ||
		     send_ring->err_state)) {
		mlx5_core_dbg_once(dmn->mdev,
				   "Skipping post send: QP err state: %d, device state: %d\n",
				   send_ring->err_state, dmn->mdev->state);
		return 0;
	}

	spin_lock(&send_ring->lock);

	/* Reclaim completed slots; may mark the ring broken on CQ error */
	ret = dr_handle_pending_wc(dmn, send_ring);
	if (ret)
		goto out_unlock;

	if (send_info->write.length > dmn->info.max_inline_size) {
		/* Payload above the max inline size is staged into the
		 * ring MR buffer slot owned by this head and sent from
		 * there with the ring's lkey.
		 */
		buff_offset = (send_ring->tx_head &
			       (dmn->send_ring->signal_th - 1)) *
			send_ring->max_post_send_size;
		/* Copy to ring mr */
		memcpy(send_ring->buf + buff_offset,
		       (void *)(uintptr_t)send_info->write.addr,
		       send_info->write.length);
		send_info->write.addr = (uintptr_t)send_ring->mr->dma_addr + buff_offset;
		send_info->write.lkey = send_ring->mr->mkey.key;
	}

	send_ring->tx_head++;
	dr_fill_data_segs(send_ring, send_info);
	dr_post_send(send_ring->qp, send_info);

out_unlock:
	spin_unlock(&send_ring->lock);
	return ret;
}
402
403 static int dr_get_tbl_copy_details(struct mlx5dr_domain *dmn,
404                                    struct mlx5dr_ste_htbl *htbl,
405                                    u8 **data,
406                                    u32 *byte_size,
407                                    int *iterations,
408                                    int *num_stes)
409 {
410         int alloc_size;
411
412         if (htbl->chunk->byte_size > dmn->send_ring->max_post_send_size) {
413                 *iterations = htbl->chunk->byte_size /
414                         dmn->send_ring->max_post_send_size;
415                 *byte_size = dmn->send_ring->max_post_send_size;
416                 alloc_size = *byte_size;
417                 *num_stes = *byte_size / DR_STE_SIZE;
418         } else {
419                 *iterations = 1;
420                 *num_stes = htbl->chunk->num_of_entries;
421                 alloc_size = *num_stes * DR_STE_SIZE;
422         }
423
424         *data = kvzalloc(alloc_size, GFP_KERNEL);
425         if (!*data)
426                 return -ENOMEM;
427
428         return 0;
429 }
430
431 /**
432  * mlx5dr_send_postsend_ste: write size bytes into offset from the hw cm.
433  *
434  *     @dmn:    Domain
435  *     @ste:    The ste struct that contains the data (at
436  *              least part of it)
437  *     @data:   The real data to send size data
438  *     @size:   for writing.
439  *     @offset: The offset from the icm mapped data to
440  *              start write to this for write only part of the
441  *              buffer.
442  *
443  * Return: 0 on success.
444  */
445 int mlx5dr_send_postsend_ste(struct mlx5dr_domain *dmn, struct mlx5dr_ste *ste,
446                              u8 *data, u16 size, u16 offset)
447 {
448         struct postsend_info send_info = {};
449
450         mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size);
451
452         send_info.write.addr = (uintptr_t)data;
453         send_info.write.length = size;
454         send_info.write.lkey = 0;
455         send_info.remote_addr = mlx5dr_ste_get_mr_addr(ste) + offset;
456         send_info.rkey = ste->htbl->chunk->rkey;
457
458         return dr_postsend_icm_data(dmn, &send_info);
459 }
460
/* Write a whole hash table to ICM: used STEs keep their own (reduced)
 * data plus the given bit mask, unused entries get the formatted
 * default STE. Large tables are sent in several posts.
 */
int mlx5dr_send_postsend_htbl(struct mlx5dr_domain *dmn,
			      struct mlx5dr_ste_htbl *htbl,
			      u8 *formatted_ste, u8 *mask)
{
	u32 byte_size = htbl->chunk->byte_size;
	int num_stes_per_iter;
	int iterations;
	u8 *data;
	int ret;
	int i;
	int j;

	ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
				      &iterations, &num_stes_per_iter);
	if (ret)
		return ret;

	mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, formatted_ste, DR_STE_SIZE);

	/* Send the data iteration times */
	for (i = 0; i < iterations; i++) {
		u32 ste_index = i * (byte_size / DR_STE_SIZE);
		struct postsend_info send_info = {};

		/* Copy all ste's on the data buffer
		 * need to add the bit_mask
		 */
		for (j = 0; j < num_stes_per_iter; j++) {
			struct mlx5dr_ste *ste = &htbl->ste_arr[ste_index + j];
			u32 ste_off = j * DR_STE_SIZE;

			if (mlx5dr_ste_is_not_used(ste)) {
				/* Unused entry - write the default STE */
				memcpy(data + ste_off,
				       formatted_ste, DR_STE_SIZE);
			} else {
				/* Copy data */
				memcpy(data + ste_off,
				       htbl->ste_arr[ste_index + j].hw_ste,
				       DR_STE_SIZE_REDUCED);
				/* Copy bit_mask */
				memcpy(data + ste_off + DR_STE_SIZE_REDUCED,
				       mask, DR_STE_SIZE_MASK);
				/* Only when we have mask we need to re-arrange the STE */
				mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx,
								data + (j * DR_STE_SIZE),
								DR_STE_SIZE);
			}
		}

		send_info.write.addr = (uintptr_t)data;
		send_info.write.length = byte_size;
		send_info.write.lkey = 0;
		send_info.remote_addr =
			mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
		send_info.rkey = htbl->chunk->rkey;

		ret = dr_postsend_icm_data(dmn, &send_info);
		if (ret)
			goto out_free;
	}

out_free:
	kvfree(data);
	return ret;
}
526
527 /* Initialize htble with default STEs */
528 int mlx5dr_send_postsend_formatted_htbl(struct mlx5dr_domain *dmn,
529                                         struct mlx5dr_ste_htbl *htbl,
530                                         u8 *ste_init_data,
531                                         bool update_hw_ste)
532 {
533         u32 byte_size = htbl->chunk->byte_size;
534         int iterations;
535         int num_stes;
536         u8 *copy_dst;
537         u8 *data;
538         int ret;
539         int i;
540
541         ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size,
542                                       &iterations, &num_stes);
543         if (ret)
544                 return ret;
545
546         if (update_hw_ste) {
547                 /* Copy the reduced STE to hash table ste_arr */
548                 for (i = 0; i < num_stes; i++) {
549                         copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED;
550                         memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED);
551                 }
552         }
553
554         mlx5dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE);
555
556         /* Copy the same STE on the data buffer */
557         for (i = 0; i < num_stes; i++) {
558                 copy_dst = data + i * DR_STE_SIZE;
559                 memcpy(copy_dst, ste_init_data, DR_STE_SIZE);
560         }
561
562         /* Send the data iteration times */
563         for (i = 0; i < iterations; i++) {
564                 u8 ste_index = i * (byte_size / DR_STE_SIZE);
565                 struct postsend_info send_info = {};
566
567                 send_info.write.addr = (uintptr_t)data;
568                 send_info.write.length = byte_size;
569                 send_info.write.lkey = 0;
570                 send_info.remote_addr =
571                         mlx5dr_ste_get_mr_addr(htbl->ste_arr + ste_index);
572                 send_info.rkey = htbl->chunk->rkey;
573
574                 ret = dr_postsend_icm_data(dmn, &send_info);
575                 if (ret)
576                         goto out_free;
577         }
578
579 out_free:
580         kvfree(data);
581         return ret;
582 }
583
584 int mlx5dr_send_postsend_action(struct mlx5dr_domain *dmn,
585                                 struct mlx5dr_action *action)
586 {
587         struct postsend_info send_info = {};
588         int ret;
589
590         send_info.write.addr = (uintptr_t)action->rewrite->data;
591         send_info.write.length = action->rewrite->num_of_actions *
592                                  DR_MODIFY_ACTION_SIZE;
593         send_info.write.lkey = 0;
594         send_info.remote_addr = action->rewrite->chunk->mr_addr;
595         send_info.rkey = action->rewrite->chunk->rkey;
596
597         ret = dr_postsend_icm_data(dmn, &send_info);
598
599         return ret;
600 }
601
/* Move the QP from RESET to INIT: bind it to @port and enable remote
 * read/write access.
 */
static int dr_modify_qp_rst2init(struct mlx5_core_dev *mdev,
				 struct mlx5dr_qp *dr_qp,
				 int port)
{
	u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	void *qpc;

	qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc);

	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, port);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);

	MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, in, qpn, dr_qp->qpn);

	return mlx5_cmd_exec_in(mdev, rst2init_qp, in);
}
621
622 static int dr_cmd_modify_qp_rtr2rts(struct mlx5_core_dev *mdev,
623                                     struct mlx5dr_qp *dr_qp,
624                                     struct dr_qp_rts_attr *attr)
625 {
626         u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
627         void *qpc;
628
629         qpc  = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc);
630
631         MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
632
633         MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
634         MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
635         MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
636
637         MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
638         MLX5_SET(rtr2rts_qp_in, in, qpn, dr_qp->qpn);
639
640         return mlx5_cmd_exec_in(mdev, rtr2rts_qp, in);
641 }
642
643 static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev,
644                                      struct mlx5dr_qp *dr_qp,
645                                      struct dr_qp_rtr_attr *attr)
646 {
647         u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
648         void *qpc;
649
650         qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
651
652         MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
653
654         MLX5_SET(qpc, qpc, mtu, attr->mtu);
655         MLX5_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1);
656         MLX5_SET(qpc, qpc, remote_qpn, attr->qp_num);
657         memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
658                attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac));
659         memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
660                attr->dgid_attr.gid, sizeof(attr->dgid_attr.gid));
661         MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
662                  attr->sgid_index);
663
664         if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2)
665                 MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
666                          attr->udp_src_port);
667
668         MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num);
669         MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl);
670         MLX5_SET(qpc, qpc, min_rnr_nak, 1);
671
672         MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
673         MLX5_SET(init2rtr_qp_in, in, qpn, dr_qp->qpn);
674
675         return mlx5_cmd_exec_in(mdev, init2rtr_qp, in);
676 }
677
678 static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps)
679 {
680         /* Check whether RC RoCE QP creation with force loopback is allowed.
681          * There are two separate capability bits for this:
682          *  - force loopback when RoCE is enabled
683          *  - force loopback when RoCE is disabled
684          */
685         return ((caps->roce_caps.roce_en &&
686                  caps->roce_caps.fl_rc_qp_when_roce_enabled) ||
687                 (!caps->roce_caps.roce_en &&
688                  caps->roce_caps.fl_rc_qp_when_roce_disabled));
689 }
690
/* Walk the send-ring QP through RESET -> INIT -> RTR -> RTS.
 * The QP is connected to itself (loopback): RTR's remote QPN is our
 * own QPN. Returns 0 on success, negative errno otherwise.
 */
static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
{
	struct mlx5dr_qp *dr_qp = dmn->send_ring->qp;
	struct dr_qp_rts_attr rts_attr = {};
	struct dr_qp_rtr_attr rtr_attr = {};
	enum ib_mtu mtu = IB_MTU_1024;
	u16 gid_index = 0;
	int port = 1;
	int ret;

	/* Init */
	ret = dr_modify_qp_rst2init(dmn->mdev, dr_qp, port);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rst2init\n");
		return ret;
	}

	/* RTR */
	rtr_attr.mtu            = mtu;
	rtr_attr.qp_num         = dr_qp->qpn;
	rtr_attr.min_rnr_timer  = 12;
	rtr_attr.port_num       = port;
	rtr_attr.udp_src_port   = dmn->info.caps.roce_min_src_udp;

	/* If QP creation with force loopback is allowed, then there
	 * is no need for GID index when creating the QP.
	 * Otherwise we query GID attributes and use GID index.
	 */
	rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps);
	if (!rtr_attr.fl) {
		ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index,
					   &rtr_attr.dgid_attr);
		if (ret)
			return ret;

		rtr_attr.sgid_index = gid_index;
	}

	ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP init2rtr\n");
		return ret;
	}

	/* RTS */
	rts_attr.timeout        = 14;
	rts_attr.retry_cnt      = 7;
	rts_attr.rnr_retry      = 7;

	ret = dr_cmd_modify_qp_rtr2rts(dmn->mdev, dr_qp, &rts_attr);
	if (ret) {
		mlx5dr_err(dmn, "Failed modify QP rtr2rts\n");
		return ret;
	}

	return 0;
}
748
/* CQ completion callback. The SW steering CQ is polled, so events
 * landing here are unexpected - log and ignore.
 */
static void dr_cq_complete(struct mlx5_core_cq *mcq,
			   struct mlx5_eqe *eqe)
{
	pr_err("CQ completion CQ: #%u\n", mcq->cqn);
}
754
/* Create the CQ used (in polling mode) by the SW steering QP.
 * Returns the new CQ on success, NULL on any failure.
 */
static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
				      struct mlx5_uars_page *uar,
				      size_t ncqe)
{
	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {};
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_wq_param wqp;
	struct mlx5_cqe64 *cqe;
	struct mlx5dr_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;
	u32 i;

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		return NULL;

	ncqe = roundup_pow_of_two(ncqe);
	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(ncqe));

	wqp.buf_numa_node = mdev->priv.numa_node;
	wqp.db_numa_node = mdev->priv.numa_node;

	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &cq->wq,
			       &cq->wq_ctrl);
	if (err)
		goto out;

	/* Mark all CQEs invalid and HW-owned so polling starts clean */
	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
		cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
	}

	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		sizeof(u64) * cq->wq_ctrl.buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		goto err_cqwq;

	/* CQ must still be bound to an EQ even though it is polled */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
	err = mlx5_vector2eqn(mdev, vector, &eqn);
	if (err) {
		kvfree(in);
		goto err_cqwq;
	}

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma);

	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);

	cq->mcq.comp  = dr_cq_complete;

	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	kvfree(in);

	if (err)
		goto err_cqwq;

	cq->mcq.cqe_sz = 64;
	cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
	cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
	*cq->mcq.set_ci_db = 0;

	/* set no-zero value, in order to avoid the HW to run db-recovery on
	 * CQ that used in polling mode.
	 */
	*cq->mcq.arm_db = cpu_to_be32(2 << 28);

	cq->mcq.vector = 0;
	cq->mcq.uar = uar;

	return cq;

err_cqwq:
	mlx5_wq_destroy(&cq->wq_ctrl);
out:
	kfree(cq);
	return NULL;
}
843
/* Tear down a CQ created by dr_create_cq(): destroy the HW CQ object
 * first, then release the work-queue resources backing it, then the
 * host-side descriptor. Order matters — the CQ must be gone from HW
 * before its buffer memory is released.
 */
static void dr_destroy_cq(struct mlx5_core_dev *mdev, struct mlx5dr_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_wq_destroy(&cq->wq_ctrl);
	kfree(cq);
}
850
851 static int
852 dr_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_core_mkey *mkey)
853 {
854         u32 in[MLX5_ST_SZ_DW(create_mkey_in)] = {};
855         void *mkc;
856
857         mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
858         MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
859         MLX5_SET(mkc, mkc, a, 1);
860         MLX5_SET(mkc, mkc, rw, 1);
861         MLX5_SET(mkc, mkc, rr, 1);
862         MLX5_SET(mkc, mkc, lw, 1);
863         MLX5_SET(mkc, mkc, lr, 1);
864
865         MLX5_SET(mkc, mkc, pd, pdn);
866         MLX5_SET(mkc, mkc, length64, 1);
867         MLX5_SET(mkc, mkc, qpn, 0xffffff);
868
869         return mlx5_core_create_mkey(mdev, mkey, in, sizeof(in));
870 }
871
872 static struct mlx5dr_mr *dr_reg_mr(struct mlx5_core_dev *mdev,
873                                    u32 pdn, void *buf, size_t size)
874 {
875         struct mlx5dr_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
876         struct device *dma_device;
877         dma_addr_t dma_addr;
878         int err;
879
880         if (!mr)
881                 return NULL;
882
883         dma_device = mlx5_core_dma_dev(mdev);
884         dma_addr = dma_map_single(dma_device, buf, size,
885                                   DMA_BIDIRECTIONAL);
886         err = dma_mapping_error(dma_device, dma_addr);
887         if (err) {
888                 mlx5_core_warn(mdev, "Can't dma buf\n");
889                 kfree(mr);
890                 return NULL;
891         }
892
893         err = dr_create_mkey(mdev, pdn, &mr->mkey);
894         if (err) {
895                 mlx5_core_warn(mdev, "Can't create mkey\n");
896                 dma_unmap_single(dma_device, dma_addr, size,
897                                  DMA_BIDIRECTIONAL);
898                 kfree(mr);
899                 return NULL;
900         }
901
902         mr->dma_addr = dma_addr;
903         mr->size = size;
904         mr->addr = buf;
905
906         return mr;
907 }
908
/* Counterpart of dr_reg_mr(): destroy the mkey, undo the DMA mapping
 * recorded in @mr, and free the descriptor. The underlying buffer is
 * owned (and freed) by the caller.
 */
static void dr_dereg_mr(struct mlx5_core_dev *mdev, struct mlx5dr_mr *mr)
{
	mlx5_core_destroy_mkey(mdev, &mr->mkey);
	dma_unmap_single(mlx5_core_dma_dev(mdev), mr->dma_addr, mr->size,
			 DMA_BIDIRECTIONAL);
	kfree(mr);
}
916
/* Allocate and initialize the domain's SW steering send ring: a CQ,
 * an RC QP brought to RTS, a bounce buffer for ICM writes, and two
 * registered MRs (data buffer + small sync buffer). On failure every
 * resource created so far is unwound and a negative errno is returned.
 */
int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn)
{
	struct dr_qp_init_attr init_attr = {};
	int cq_size;
	int size;
	int ret;

	dmn->send_ring = kzalloc(sizeof(*dmn->send_ring), GFP_KERNEL);
	if (!dmn->send_ring)
		return -ENOMEM;

	/* one extra CQE beyond the SQ depth */
	cq_size = QUEUE_SIZE + 1;
	dmn->send_ring->cq = dr_create_cq(dmn->mdev, dmn->uar, cq_size);
	if (!dmn->send_ring->cq) {
		mlx5dr_err(dmn, "Failed creating CQ\n");
		ret = -ENOMEM;
		goto free_send_ring;
	}

	init_attr.cqn = dmn->send_ring->cq->mcq.cqn;
	init_attr.pdn = dmn->pdn;
	init_attr.uar = dmn->uar;
	init_attr.max_send_wr = QUEUE_SIZE;

	/* Isolated VL is applicable only if force loopback is supported */
	if (dr_send_allow_fl(&dmn->info.caps))
		init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc;

	spin_lock_init(&dmn->send_ring->lock);

	dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr);
	if (!dmn->send_ring->qp)  {
		mlx5dr_err(dmn, "Failed creating QP\n");
		ret = -ENOMEM;
		goto clean_cq;
	}

	/* let the CQ poller find its QP for error/completion accounting */
	dmn->send_ring->cq->qp = dmn->send_ring->qp;

	dmn->info.max_send_wr = QUEUE_SIZE;
	/* inline size is capped by what the QP supports, at most one STE */
	dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data,
					DR_STE_SIZE);

	/* request a signaled completion once per signal_th posted WQEs */
	dmn->send_ring->signal_th = dmn->info.max_send_wr /
		SIGNAL_PER_DIV_QUEUE;

	/* Prepare qp to be used */
	ret = dr_prepare_qp_to_rts(dmn);
	if (ret)
		goto clean_qp;

	dmn->send_ring->max_post_send_size =
		mlx5dr_icm_pool_chunk_size_to_byte(DR_CHUNK_SIZE_1K,
						   DR_ICM_TYPE_STE);

	/* Allocating the max size as a buffer for writing */
	size = dmn->send_ring->signal_th * dmn->send_ring->max_post_send_size;
	dmn->send_ring->buf = kzalloc(size, GFP_KERNEL);
	if (!dmn->send_ring->buf) {
		ret = -ENOMEM;
		goto clean_qp;
	}

	dmn->send_ring->buf_size = size;

	dmn->send_ring->mr = dr_reg_mr(dmn->mdev,
				       dmn->pdn, dmn->send_ring->buf, size);
	if (!dmn->send_ring->mr) {
		ret = -ENOMEM;
		goto free_mem;
	}

	/* small MR used as the remote target of drain/sync operations */
	dmn->send_ring->sync_mr = dr_reg_mr(dmn->mdev,
					    dmn->pdn, dmn->send_ring->sync_buff,
					    MIN_READ_SYNC);
	if (!dmn->send_ring->sync_mr) {
		ret = -ENOMEM;
		goto clean_mr;
	}

	return 0;

clean_mr:
	dr_dereg_mr(dmn->mdev, dmn->send_ring->mr);
free_mem:
	kfree(dmn->send_ring->buf);
clean_qp:
	dr_destroy_qp(dmn->mdev, dmn->send_ring->qp);
clean_cq:
	dr_destroy_cq(dmn->mdev, dmn->send_ring->cq);
free_send_ring:
	kfree(dmn->send_ring);

	return ret;
}
1012
/* Release all send ring resources in reverse order of allocation:
 * HW objects (QP, then CQ) first, then the MRs (which unmap their
 * DMA mappings), then the bounce buffer and the ring structure.
 */
void mlx5dr_send_ring_free(struct mlx5dr_domain *dmn,
			   struct mlx5dr_send_ring *send_ring)
{
	dr_destroy_qp(dmn->mdev, send_ring->qp);
	dr_destroy_cq(dmn->mdev, send_ring->cq);
	dr_dereg_mr(dmn->mdev, send_ring->sync_mr);
	dr_dereg_mr(dmn->mdev, send_ring->mr);
	kfree(send_ring->buf);
	kfree(send_ring);
}
1023
1024 int mlx5dr_send_ring_force_drain(struct mlx5dr_domain *dmn)
1025 {
1026         struct mlx5dr_send_ring *send_ring = dmn->send_ring;
1027         struct postsend_info send_info = {};
1028         u8 data[DR_STE_SIZE];
1029         int num_of_sends_req;
1030         int ret;
1031         int i;
1032
1033         /* Sending this amount of requests makes sure we will get drain */
1034         num_of_sends_req = send_ring->signal_th * TH_NUMS_TO_DRAIN / 2;
1035
1036         /* Send fake requests forcing the last to be signaled */
1037         send_info.write.addr = (uintptr_t)data;
1038         send_info.write.length = DR_STE_SIZE;
1039         send_info.write.lkey = 0;
1040         /* Using the sync_mr in order to write/read */
1041         send_info.remote_addr = (uintptr_t)send_ring->sync_mr->addr;
1042         send_info.rkey = send_ring->sync_mr->mkey.key;
1043
1044         for (i = 0; i < num_of_sends_req; i++) {
1045                 ret = dr_postsend_icm_data(dmn, &send_info);
1046                 if (ret)
1047                         return ret;
1048         }
1049
1050         spin_lock(&send_ring->lock);
1051         ret = dr_handle_pending_wc(dmn, send_ring);
1052         spin_unlock(&send_ring->lock);
1053
1054         return ret;
1055 }