Merge tag 'zynqmp-soc-for-v5.7' of https://github.com/Xilinx/linux-xlnx into arm/soc
[linux-2.6-microblaze.git] / drivers / infiniband / hw / mlx5 / main.c
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/module.h>
36 #include <linux/init.h>
37 #include <linux/errno.h>
38 #include <linux/pci.h>
39 #include <linux/dma-mapping.h>
40 #include <linux/slab.h>
41 #include <linux/bitmap.h>
42 #if defined(CONFIG_X86)
43 #include <asm/memtype.h>
44 #endif
45 #include <linux/sched.h>
46 #include <linux/sched/mm.h>
47 #include <linux/sched/task.h>
48 #include <linux/delay.h>
49 #include <rdma/ib_user_verbs.h>
50 #include <rdma/ib_addr.h>
51 #include <rdma/ib_cache.h>
52 #include <linux/mlx5/port.h>
53 #include <linux/mlx5/vport.h>
54 #include <linux/mlx5/fs.h>
55 #include <linux/mlx5/eswitch.h>
56 #include <linux/list.h>
57 #include <rdma/ib_smi.h>
58 #include <rdma/ib_umem.h>
59 #include <linux/in.h>
60 #include <linux/etherdevice.h>
61 #include "mlx5_ib.h"
62 #include "ib_rep.h"
63 #include "cmd.h"
64 #include "srq.h"
65 #include <linux/mlx5/fs_helpers.h>
66 #include <linux/mlx5/accel.h>
67 #include <rdma/uverbs_std_types.h>
68 #include <rdma/mlx5_user_ioctl_verbs.h>
69 #include <rdma/mlx5_user_ioctl_cmds.h>
70 #include <rdma/ib_umem_odp.h>
71
72 #define UVERBS_MODULE_NAME mlx5_ib
73 #include <rdma/uverbs_named_ioctl.h>
74
75 #define DRIVER_NAME "mlx5_ib"
76 #define DRIVER_VERSION "5.0-0"
77
78 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
79 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
80 MODULE_LICENSE("Dual BSD/GPL");
81
82 static char mlx5_version[] =
83         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
84         DRIVER_VERSION "\n";
85
86 struct mlx5_ib_event_work {
87         struct work_struct      work;
88         union {
89                 struct mlx5_ib_dev            *dev;
90                 struct mlx5_ib_multiport_info *mpi;
91         };
92         bool                    is_slave;
93         unsigned int            event;
94         void                    *param;
95 };
96
97 enum {
98         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
99 };
100
101 static struct workqueue_struct *mlx5_ib_event_wq;
102 static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
103 static LIST_HEAD(mlx5_ib_dev_list);
104 /*
105  * This mutex should be held when accessing either of the above lists
106  */
107 static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
108
109 /* We can't use an array for xlt_emergency_page because dma_map_single
110  * doesn't work on kernel modules memory
111  */
112 static unsigned long xlt_emergency_page;
113 static struct mutex xlt_emergency_page_mutex;
114
115 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
116 {
117         struct mlx5_ib_dev *dev;
118
119         mutex_lock(&mlx5_ib_multiport_mutex);
120         dev = mpi->ibdev;
121         mutex_unlock(&mlx5_ib_multiport_mutex);
122         return dev;
123 }
124
125 static enum rdma_link_layer
126 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
127 {
128         switch (port_type_cap) {
129         case MLX5_CAP_PORT_TYPE_IB:
130                 return IB_LINK_LAYER_INFINIBAND;
131         case MLX5_CAP_PORT_TYPE_ETH:
132                 return IB_LINK_LAYER_ETHERNET;
133         default:
134                 return IB_LINK_LAYER_UNSPECIFIED;
135         }
136 }
137
138 static enum rdma_link_layer
139 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
140 {
141         struct mlx5_ib_dev *dev = to_mdev(device);
142         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
143
144         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
145 }
146
147 static int get_port_state(struct ib_device *ibdev,
148                           u8 port_num,
149                           enum ib_port_state *state)
150 {
151         struct ib_port_attr attr;
152         int ret;
153
154         memset(&attr, 0, sizeof(attr));
155         ret = ibdev->ops.query_port(ibdev, port_num, &attr);
156         if (!ret)
157                 *state = attr.state;
158         return ret;
159 }
160
161 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
162                                            struct net_device *ndev,
163                                            u8 *port_num)
164 {
165         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
166         struct net_device *rep_ndev;
167         struct mlx5_ib_port *port;
168         int i;
169
170         for (i = 0; i < dev->num_ports; i++) {
171                 port  = &dev->port[i];
172                 if (!port->rep)
173                         continue;
174
175                 read_lock(&port->roce.netdev_lock);
176                 rep_ndev = mlx5_ib_get_rep_netdev(esw,
177                                                   port->rep->vport);
178                 if (rep_ndev == ndev) {
179                         read_unlock(&port->roce.netdev_lock);
180                         *port_num = i + 1;
181                         return &port->roce;
182                 }
183                 read_unlock(&port->roce.netdev_lock);
184         }
185
186         return NULL;
187 }
188
189 static int mlx5_netdev_event(struct notifier_block *this,
190                              unsigned long event, void *ptr)
191 {
192         struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
193         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
194         u8 port_num = roce->native_port_num;
195         struct mlx5_core_dev *mdev;
196         struct mlx5_ib_dev *ibdev;
197
198         ibdev = roce->dev;
199         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
200         if (!mdev)
201                 return NOTIFY_DONE;
202
203         switch (event) {
204         case NETDEV_REGISTER:
205                 /* Should already be registered during the load */
206                 if (ibdev->is_rep)
207                         break;
208                 write_lock(&roce->netdev_lock);
209                 if (ndev->dev.parent == mdev->device)
210                         roce->netdev = ndev;
211                 write_unlock(&roce->netdev_lock);
212                 break;
213
214         case NETDEV_UNREGISTER:
215                 /* In case of reps, ib device goes away before the netdevs */
216                 write_lock(&roce->netdev_lock);
217                 if (roce->netdev == ndev)
218                         roce->netdev = NULL;
219                 write_unlock(&roce->netdev_lock);
220                 break;
221
222         case NETDEV_CHANGE:
223         case NETDEV_UP:
224         case NETDEV_DOWN: {
225                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
226                 struct net_device *upper = NULL;
227
228                 if (lag_ndev) {
229                         upper = netdev_master_upper_dev_get(lag_ndev);
230                         dev_put(lag_ndev);
231                 }
232
233                 if (ibdev->is_rep)
234                         roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
235                 if (!roce)
236                         return NOTIFY_DONE;
237                 if ((upper == ndev || (!upper && ndev == roce->netdev))
238                     && ibdev->ib_active) {
239                         struct ib_event ibev = { };
240                         enum ib_port_state port_state;
241
242                         if (get_port_state(&ibdev->ib_dev, port_num,
243                                            &port_state))
244                                 goto done;
245
246                         if (roce->last_port_state == port_state)
247                                 goto done;
248
249                         roce->last_port_state = port_state;
250                         ibev.device = &ibdev->ib_dev;
251                         if (port_state == IB_PORT_DOWN)
252                                 ibev.event = IB_EVENT_PORT_ERR;
253                         else if (port_state == IB_PORT_ACTIVE)
254                                 ibev.event = IB_EVENT_PORT_ACTIVE;
255                         else
256                                 goto done;
257
258                         ibev.element.port_num = port_num;
259                         ib_dispatch_event(&ibev);
260                 }
261                 break;
262         }
263
264         default:
265                 break;
266         }
267 done:
268         mlx5_ib_put_native_port_mdev(ibdev, port_num);
269         return NOTIFY_DONE;
270 }
271
272 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
273                                              u8 port_num)
274 {
275         struct mlx5_ib_dev *ibdev = to_mdev(device);
276         struct net_device *ndev;
277         struct mlx5_core_dev *mdev;
278
279         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
280         if (!mdev)
281                 return NULL;
282
283         ndev = mlx5_lag_get_roce_netdev(mdev);
284         if (ndev)
285                 goto out;
286
287         /* Ensure ndev does not disappear before we invoke dev_hold()
288          */
289         read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
290         ndev = ibdev->port[port_num - 1].roce.netdev;
291         if (ndev)
292                 dev_hold(ndev);
293         read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
294
295 out:
296         mlx5_ib_put_native_port_mdev(ibdev, port_num);
297         return ndev;
298 }
299
300 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
301                                                    u8 ib_port_num,
302                                                    u8 *native_port_num)
303 {
304         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
305                                                           ib_port_num);
306         struct mlx5_core_dev *mdev = NULL;
307         struct mlx5_ib_multiport_info *mpi;
308         struct mlx5_ib_port *port;
309
310         if (!mlx5_core_mp_enabled(ibdev->mdev) ||
311             ll != IB_LINK_LAYER_ETHERNET) {
312                 if (native_port_num)
313                         *native_port_num = ib_port_num;
314                 return ibdev->mdev;
315         }
316
317         if (native_port_num)
318                 *native_port_num = 1;
319
320         port = &ibdev->port[ib_port_num - 1];
321         if (!port)
322                 return NULL;
323
324         spin_lock(&port->mp.mpi_lock);
325         mpi = ibdev->port[ib_port_num - 1].mp.mpi;
326         if (mpi && !mpi->unaffiliate) {
327                 mdev = mpi->mdev;
328                 /* If it's the master no need to refcount, it'll exist
329                  * as long as the ib_dev exists.
330                  */
331                 if (!mpi->is_master)
332                         mpi->mdev_refcnt++;
333         }
334         spin_unlock(&port->mp.mpi_lock);
335
336         return mdev;
337 }
338
339 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
340 {
341         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
342                                                           port_num);
343         struct mlx5_ib_multiport_info *mpi;
344         struct mlx5_ib_port *port;
345
346         if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
347                 return;
348
349         port = &ibdev->port[port_num - 1];
350
351         spin_lock(&port->mp.mpi_lock);
352         mpi = ibdev->port[port_num - 1].mp.mpi;
353         if (mpi->is_master)
354                 goto out;
355
356         mpi->mdev_refcnt--;
357         if (mpi->unaffiliate)
358                 complete(&mpi->unref_comp);
359 out:
360         spin_unlock(&port->mp.mpi_lock);
361 }
362
363 static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
364                                            u8 *active_width)
365 {
366         switch (eth_proto_oper) {
367         case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
368         case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
369         case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
370         case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
371                 *active_width = IB_WIDTH_1X;
372                 *active_speed = IB_SPEED_SDR;
373                 break;
374         case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
375         case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
376         case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
377         case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
378         case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
379         case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
380         case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
381                 *active_width = IB_WIDTH_1X;
382                 *active_speed = IB_SPEED_QDR;
383                 break;
384         case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
385         case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
386         case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
387                 *active_width = IB_WIDTH_1X;
388                 *active_speed = IB_SPEED_EDR;
389                 break;
390         case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
391         case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
392         case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
393         case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
394                 *active_width = IB_WIDTH_4X;
395                 *active_speed = IB_SPEED_QDR;
396                 break;
397         case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
398         case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
399         case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
400                 *active_width = IB_WIDTH_1X;
401                 *active_speed = IB_SPEED_HDR;
402                 break;
403         case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
404                 *active_width = IB_WIDTH_4X;
405                 *active_speed = IB_SPEED_FDR;
406                 break;
407         case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
408         case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
409         case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
410         case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
411                 *active_width = IB_WIDTH_4X;
412                 *active_speed = IB_SPEED_EDR;
413                 break;
414         default:
415                 return -EINVAL;
416         }
417
418         return 0;
419 }
420
421 static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
422                                         u8 *active_width)
423 {
424         switch (eth_proto_oper) {
425         case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
426         case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
427                 *active_width = IB_WIDTH_1X;
428                 *active_speed = IB_SPEED_SDR;
429                 break;
430         case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
431                 *active_width = IB_WIDTH_1X;
432                 *active_speed = IB_SPEED_DDR;
433                 break;
434         case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
435                 *active_width = IB_WIDTH_1X;
436                 *active_speed = IB_SPEED_QDR;
437                 break;
438         case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
439                 *active_width = IB_WIDTH_4X;
440                 *active_speed = IB_SPEED_QDR;
441                 break;
442         case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
443                 *active_width = IB_WIDTH_1X;
444                 *active_speed = IB_SPEED_EDR;
445                 break;
446         case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
447                 *active_width = IB_WIDTH_2X;
448                 *active_speed = IB_SPEED_EDR;
449                 break;
450         case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
451                 *active_width = IB_WIDTH_1X;
452                 *active_speed = IB_SPEED_HDR;
453                 break;
454         case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
455                 *active_width = IB_WIDTH_4X;
456                 *active_speed = IB_SPEED_EDR;
457                 break;
458         case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
459                 *active_width = IB_WIDTH_2X;
460                 *active_speed = IB_SPEED_HDR;
461                 break;
462         case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
463                 *active_width = IB_WIDTH_4X;
464                 *active_speed = IB_SPEED_HDR;
465                 break;
466         default:
467                 return -EINVAL;
468         }
469
470         return 0;
471 }
472
473 static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
474                                     u8 *active_width, bool ext)
475 {
476         return ext ?
477                 translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
478                                              active_width) :
479                 translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
480                                                 active_width);
481 }
482
483 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
484                                 struct ib_port_attr *props)
485 {
486         struct mlx5_ib_dev *dev = to_mdev(device);
487         u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
488         struct mlx5_core_dev *mdev;
489         struct net_device *ndev, *upper;
490         enum ib_mtu ndev_ib_mtu;
491         bool put_mdev = true;
492         u16 qkey_viol_cntr;
493         u32 eth_prot_oper;
494         u8 mdev_port_num;
495         bool ext;
496         int err;
497
498         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
499         if (!mdev) {
500                 /* This means the port isn't affiliated yet. Get the
501                  * info for the master port instead.
502                  */
503                 put_mdev = false;
504                 mdev = dev->mdev;
505                 mdev_port_num = 1;
506                 port_num = 1;
507         }
508
509         /* Possible bad flows are checked before filling out props so in case
510          * of an error it will still be zeroed out.
511          * Use native port in case of reps
512          */
513         if (dev->is_rep)
514                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
515                                            1);
516         else
517                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
518                                            mdev_port_num);
519         if (err)
520                 goto out;
521         ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
522         eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
523
524         props->active_width     = IB_WIDTH_4X;
525         props->active_speed     = IB_SPEED_QDR;
526
527         translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
528                                  &props->active_width, ext);
529
530         props->port_cap_flags |= IB_PORT_CM_SUP;
531         props->ip_gids = true;
532
533         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
534                                                 roce_address_table_size);
535         props->max_mtu          = IB_MTU_4096;
536         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
537         props->pkey_tbl_len     = 1;
538         props->state            = IB_PORT_DOWN;
539         props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
540
541         mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
542         props->qkey_viol_cntr = qkey_viol_cntr;
543
544         /* If this is a stub query for an unaffiliated port stop here */
545         if (!put_mdev)
546                 goto out;
547
548         ndev = mlx5_ib_get_netdev(device, port_num);
549         if (!ndev)
550                 goto out;
551
552         if (dev->lag_active) {
553                 rcu_read_lock();
554                 upper = netdev_master_upper_dev_get_rcu(ndev);
555                 if (upper) {
556                         dev_put(ndev);
557                         ndev = upper;
558                         dev_hold(ndev);
559                 }
560                 rcu_read_unlock();
561         }
562
563         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
564                 props->state      = IB_PORT_ACTIVE;
565                 props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
566         }
567
568         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
569
570         dev_put(ndev);
571
572         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
573 out:
574         if (put_mdev)
575                 mlx5_ib_put_native_port_mdev(dev, port_num);
576         return err;
577 }
578
579 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
580                          unsigned int index, const union ib_gid *gid,
581                          const struct ib_gid_attr *attr)
582 {
583         enum ib_gid_type gid_type = IB_GID_TYPE_IB;
584         u16 vlan_id = 0xffff;
585         u8 roce_version = 0;
586         u8 roce_l3_type = 0;
587         u8 mac[ETH_ALEN];
588         int ret;
589
590         if (gid) {
591                 gid_type = attr->gid_type;
592                 ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
593                 if (ret)
594                         return ret;
595         }
596
597         switch (gid_type) {
598         case IB_GID_TYPE_IB:
599                 roce_version = MLX5_ROCE_VERSION_1;
600                 break;
601         case IB_GID_TYPE_ROCE_UDP_ENCAP:
602                 roce_version = MLX5_ROCE_VERSION_2;
603                 if (ipv6_addr_v4mapped((void *)gid))
604                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
605                 else
606                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
607                 break;
608
609         default:
610                 mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
611         }
612
613         return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
614                                       roce_l3_type, gid->raw, mac,
615                                       vlan_id < VLAN_CFI_MASK, vlan_id,
616                                       port_num);
617 }
618
619 static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
620                            __always_unused void **context)
621 {
622         return set_roce_addr(to_mdev(attr->device), attr->port_num,
623                              attr->index, &attr->gid, attr);
624 }
625
626 static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
627                            __always_unused void **context)
628 {
629         return set_roce_addr(to_mdev(attr->device), attr->port_num,
630                              attr->index, NULL, NULL);
631 }
632
633 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
634                                const struct ib_gid_attr *attr)
635 {
636         if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
637                 return 0;
638
639         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
640 }
641
642 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
643 {
644         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
645                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
646         return 0;
647 }
648
649 enum {
650         MLX5_VPORT_ACCESS_METHOD_MAD,
651         MLX5_VPORT_ACCESS_METHOD_HCA,
652         MLX5_VPORT_ACCESS_METHOD_NIC,
653 };
654
655 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
656 {
657         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
658                 return MLX5_VPORT_ACCESS_METHOD_MAD;
659
660         if (mlx5_ib_port_link_layer(ibdev, 1) ==
661             IB_LINK_LAYER_ETHERNET)
662                 return MLX5_VPORT_ACCESS_METHOD_NIC;
663
664         return MLX5_VPORT_ACCESS_METHOD_HCA;
665 }
666
667 static void get_atomic_caps(struct mlx5_ib_dev *dev,
668                             u8 atomic_size_qp,
669                             struct ib_device_attr *props)
670 {
671         u8 tmp;
672         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
673         u8 atomic_req_8B_endianness_mode =
674                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
675
676         /* Check if HW supports 8 bytes standard atomic operations and capable
677          * of host endianness respond
678          */
679         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
680         if (((atomic_operations & tmp) == tmp) &&
681             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
682             (atomic_req_8B_endianness_mode)) {
683                 props->atomic_cap = IB_ATOMIC_HCA;
684         } else {
685                 props->atomic_cap = IB_ATOMIC_NONE;
686         }
687 }
688
689 static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
690                                struct ib_device_attr *props)
691 {
692         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
693
694         get_atomic_caps(dev, atomic_size_qp, props);
695 }
696
697 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
698                                         __be64 *sys_image_guid)
699 {
700         struct mlx5_ib_dev *dev = to_mdev(ibdev);
701         struct mlx5_core_dev *mdev = dev->mdev;
702         u64 tmp;
703         int err;
704
705         switch (mlx5_get_vport_access_method(ibdev)) {
706         case MLX5_VPORT_ACCESS_METHOD_MAD:
707                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
708                                                             sys_image_guid);
709
710         case MLX5_VPORT_ACCESS_METHOD_HCA:
711                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
712                 break;
713
714         case MLX5_VPORT_ACCESS_METHOD_NIC:
715                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
716                 break;
717
718         default:
719                 return -EINVAL;
720         }
721
722         if (!err)
723                 *sys_image_guid = cpu_to_be64(tmp);
724
725         return err;
726
727 }
728
729 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
730                                 u16 *max_pkeys)
731 {
732         struct mlx5_ib_dev *dev = to_mdev(ibdev);
733         struct mlx5_core_dev *mdev = dev->mdev;
734
735         switch (mlx5_get_vport_access_method(ibdev)) {
736         case MLX5_VPORT_ACCESS_METHOD_MAD:
737                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
738
739         case MLX5_VPORT_ACCESS_METHOD_HCA:
740         case MLX5_VPORT_ACCESS_METHOD_NIC:
741                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
742                                                 pkey_table_size));
743                 return 0;
744
745         default:
746                 return -EINVAL;
747         }
748 }
749
750 static int mlx5_query_vendor_id(struct ib_device *ibdev,
751                                 u32 *vendor_id)
752 {
753         struct mlx5_ib_dev *dev = to_mdev(ibdev);
754
755         switch (mlx5_get_vport_access_method(ibdev)) {
756         case MLX5_VPORT_ACCESS_METHOD_MAD:
757                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
758
759         case MLX5_VPORT_ACCESS_METHOD_HCA:
760         case MLX5_VPORT_ACCESS_METHOD_NIC:
761                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
762
763         default:
764                 return -EINVAL;
765         }
766 }
767
768 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
769                                 __be64 *node_guid)
770 {
771         u64 tmp;
772         int err;
773
774         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
775         case MLX5_VPORT_ACCESS_METHOD_MAD:
776                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
777
778         case MLX5_VPORT_ACCESS_METHOD_HCA:
779                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
780                 break;
781
782         case MLX5_VPORT_ACCESS_METHOD_NIC:
783                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
784                 break;
785
786         default:
787                 return -EINVAL;
788         }
789
790         if (!err)
791                 *node_guid = cpu_to_be64(tmp);
792
793         return err;
794 }
795
796 struct mlx5_reg_node_desc {
797         u8      desc[IB_DEVICE_NODE_DESC_MAX];
798 };
799
800 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
801 {
802         struct mlx5_reg_node_desc in;
803
804         if (mlx5_use_mad_ifc(dev))
805                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
806
807         memset(&in, 0, sizeof(in));
808
809         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
810                                     sizeof(struct mlx5_reg_node_desc),
811                                     MLX5_REG_NODE_DESC, 0, 0);
812 }
813
814 static int mlx5_ib_query_device(struct ib_device *ibdev,
815                                 struct ib_device_attr *props,
816                                 struct ib_udata *uhw)
817 {
818         size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
819         struct mlx5_ib_dev *dev = to_mdev(ibdev);
820         struct mlx5_core_dev *mdev = dev->mdev;
821         int err = -ENOMEM;
822         int max_sq_desc;
823         int max_rq_sg;
824         int max_sq_sg;
825         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
826         bool raw_support = !mlx5_core_mp_enabled(mdev);
827         struct mlx5_ib_query_device_resp resp = {};
828         size_t resp_len;
829         u64 max_tso;
830
831         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
832         if (uhw_outlen && uhw_outlen < resp_len)
833                 return -EINVAL;
834
835         resp.response_length = resp_len;
836
837         if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
838                 return -EINVAL;
839
840         memset(props, 0, sizeof(*props));
841         err = mlx5_query_system_image_guid(ibdev,
842                                            &props->sys_image_guid);
843         if (err)
844                 return err;
845
846         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
847         if (err)
848                 return err;
849
850         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
851         if (err)
852                 return err;
853
854         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
855                 (fw_rev_min(dev->mdev) << 16) |
856                 fw_rev_sub(dev->mdev);
857         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
858                 IB_DEVICE_PORT_ACTIVE_EVENT             |
859                 IB_DEVICE_SYS_IMAGE_GUID                |
860                 IB_DEVICE_RC_RNR_NAK_GEN;
861
862         if (MLX5_CAP_GEN(mdev, pkv))
863                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
864         if (MLX5_CAP_GEN(mdev, qkv))
865                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
866         if (MLX5_CAP_GEN(mdev, apm))
867                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
868         if (MLX5_CAP_GEN(mdev, xrc))
869                 props->device_cap_flags |= IB_DEVICE_XRC;
870         if (MLX5_CAP_GEN(mdev, imaicl)) {
871                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
872                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
873                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
874                 /* We support 'Gappy' memory registration too */
875                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
876         }
877         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
878         if (MLX5_CAP_GEN(mdev, sho)) {
879                 props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
880                 /* At this stage no support for signature handover */
881                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
882                                       IB_PROT_T10DIF_TYPE_2 |
883                                       IB_PROT_T10DIF_TYPE_3;
884                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
885                                        IB_GUARD_T10DIF_CSUM;
886         }
887         if (MLX5_CAP_GEN(mdev, block_lb_mc))
888                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
889
890         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
891                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
892                         /* Legacy bit to support old userspace libraries */
893                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
894                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
895                 }
896
897                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
898                         props->raw_packet_caps |=
899                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
900
901                 if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
902                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
903                         if (max_tso) {
904                                 resp.tso_caps.max_tso = 1 << max_tso;
905                                 resp.tso_caps.supported_qpts |=
906                                         1 << IB_QPT_RAW_PACKET;
907                                 resp.response_length += sizeof(resp.tso_caps);
908                         }
909                 }
910
911                 if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
912                         resp.rss_caps.rx_hash_function =
913                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
914                         resp.rss_caps.rx_hash_fields_mask =
915                                                 MLX5_RX_HASH_SRC_IPV4 |
916                                                 MLX5_RX_HASH_DST_IPV4 |
917                                                 MLX5_RX_HASH_SRC_IPV6 |
918                                                 MLX5_RX_HASH_DST_IPV6 |
919                                                 MLX5_RX_HASH_SRC_PORT_TCP |
920                                                 MLX5_RX_HASH_DST_PORT_TCP |
921                                                 MLX5_RX_HASH_SRC_PORT_UDP |
922                                                 MLX5_RX_HASH_DST_PORT_UDP |
923                                                 MLX5_RX_HASH_INNER;
924                         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
925                             MLX5_ACCEL_IPSEC_CAP_DEVICE)
926                                 resp.rss_caps.rx_hash_fields_mask |=
927                                         MLX5_RX_HASH_IPSEC_SPI;
928                         resp.response_length += sizeof(resp.rss_caps);
929                 }
930         } else {
931                 if (field_avail(typeof(resp), tso_caps, uhw_outlen))
932                         resp.response_length += sizeof(resp.tso_caps);
933                 if (field_avail(typeof(resp), rss_caps, uhw_outlen))
934                         resp.response_length += sizeof(resp.rss_caps);
935         }
936
937         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
938                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
939                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
940         }
941
942         if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
943             MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
944             raw_support)
945                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
946
947         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
948             MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
949                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
950
951         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
952             MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
953             raw_support) {
954                 /* Legacy bit to support old userspace libraries */
955                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
956                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
957         }
958
959         if (MLX5_CAP_DEV_MEM(mdev, memic)) {
960                 props->max_dm_size =
961                         MLX5_CAP_DEV_MEM(mdev, max_memic_size);
962         }
963
964         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
965                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
966
967         if (MLX5_CAP_GEN(mdev, end_pad))
968                 props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
969
970         props->vendor_part_id      = mdev->pdev->device;
971         props->hw_ver              = mdev->pdev->revision;
972
973         props->max_mr_size         = ~0ull;
974         props->page_size_cap       = ~(min_page_size - 1);
975         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
976         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
977         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
978                      sizeof(struct mlx5_wqe_data_seg);
979         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
980         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
981                      sizeof(struct mlx5_wqe_raddr_seg)) /
982                 sizeof(struct mlx5_wqe_data_seg);
983         props->max_send_sge = max_sq_sg;
984         props->max_recv_sge = max_rq_sg;
985         props->max_sge_rd          = MLX5_MAX_SGE_RD;
986         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
987         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
988         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
989         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
990         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
991         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
992         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
993         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
994         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
995         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
996         props->max_srq_sge         = max_rq_sg - 1;
997         props->max_fast_reg_page_list_len =
998                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
999         props->max_pi_fast_reg_page_list_len =
1000                 props->max_fast_reg_page_list_len / 2;
1001         props->max_sgl_rd =
1002                 MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
1003         get_atomic_caps_qp(dev, props);
1004         props->masked_atomic_cap   = IB_ATOMIC_NONE;
1005         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
1006         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
1007         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1008                                            props->max_mcast_grp;
1009         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
1010         props->max_ah = INT_MAX;
1011         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
1012         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
1013
1014         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1015                 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
1016                         props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
1017                 props->odp_caps = dev->odp_caps;
1018                 if (!uhw) {
1019                         /* ODP for kernel QPs is not implemented for receive
1020                          * WQEs and SRQ WQEs
1021                          */
1022                         props->odp_caps.per_transport_caps.rc_odp_caps &=
1023                                 ~(IB_ODP_SUPPORT_READ |
1024                                   IB_ODP_SUPPORT_SRQ_RECV);
1025                         props->odp_caps.per_transport_caps.uc_odp_caps &=
1026                                 ~(IB_ODP_SUPPORT_READ |
1027                                   IB_ODP_SUPPORT_SRQ_RECV);
1028                         props->odp_caps.per_transport_caps.ud_odp_caps &=
1029                                 ~(IB_ODP_SUPPORT_READ |
1030                                   IB_ODP_SUPPORT_SRQ_RECV);
1031                         props->odp_caps.per_transport_caps.xrc_odp_caps &=
1032                                 ~(IB_ODP_SUPPORT_READ |
1033                                   IB_ODP_SUPPORT_SRQ_RECV);
1034                 }
1035         }
1036
1037         if (MLX5_CAP_GEN(mdev, cd))
1038                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
1039
1040         if (mlx5_core_is_vf(mdev))
1041                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
1042
1043         if (mlx5_ib_port_link_layer(ibdev, 1) ==
1044             IB_LINK_LAYER_ETHERNET && raw_support) {
1045                 props->rss_caps.max_rwq_indirection_tables =
1046                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
1047                 props->rss_caps.max_rwq_indirection_table_size =
1048                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
1049                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
1050                 props->max_wq_type_rq =
1051                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
1052         }
1053
1054         if (MLX5_CAP_GEN(mdev, tag_matching)) {
1055                 props->tm_caps.max_num_tags =
1056                         (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
1057                 props->tm_caps.max_ops =
1058                         1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1059                 props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
1060         }
1061
1062         if (MLX5_CAP_GEN(mdev, tag_matching) &&
1063             MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
1064                 props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
1065                 props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
1066         }
1067
1068         if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
1069                 props->cq_caps.max_cq_moderation_count =
1070                                                 MLX5_MAX_CQ_COUNT;
1071                 props->cq_caps.max_cq_moderation_period =
1072                                                 MLX5_MAX_CQ_PERIOD;
1073         }
1074
1075         if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
1076                 resp.response_length += sizeof(resp.cqe_comp_caps);
1077
1078                 if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
1079                         resp.cqe_comp_caps.max_num =
1080                                 MLX5_CAP_GEN(dev->mdev,
1081                                              cqe_compression_max_num);
1082
1083                         resp.cqe_comp_caps.supported_format =
1084                                 MLX5_IB_CQE_RES_FORMAT_HASH |
1085                                 MLX5_IB_CQE_RES_FORMAT_CSUM;
1086
1087                         if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
1088                                 resp.cqe_comp_caps.supported_format |=
1089                                         MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
1090                 }
1091         }
1092
1093         if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
1094             raw_support) {
1095                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
1096                     MLX5_CAP_GEN(mdev, qos)) {
1097                         resp.packet_pacing_caps.qp_rate_limit_max =
1098                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
1099                         resp.packet_pacing_caps.qp_rate_limit_min =
1100                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
1101                         resp.packet_pacing_caps.supported_qpts |=
1102                                 1 << IB_QPT_RAW_PACKET;
1103                         if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
1104                             MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
1105                                 resp.packet_pacing_caps.cap_flags |=
1106                                         MLX5_IB_PP_SUPPORT_BURST;
1107                 }
1108                 resp.response_length += sizeof(resp.packet_pacing_caps);
1109         }
1110
1111         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
1112                         uhw_outlen)) {
1113                 if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1114                         resp.mlx5_ib_support_multi_pkt_send_wqes =
1115                                 MLX5_IB_ALLOW_MPW;
1116
1117                 if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1118                         resp.mlx5_ib_support_multi_pkt_send_wqes |=
1119                                 MLX5_IB_SUPPORT_EMPW;
1120
1121                 resp.response_length +=
1122                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1123         }
1124
1125         if (field_avail(typeof(resp), flags, uhw_outlen)) {
1126                 resp.response_length += sizeof(resp.flags);
1127
1128                 if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1129                         resp.flags |=
1130                                 MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1131
1132                 if (MLX5_CAP_GEN(mdev, cqe_128_always))
1133                         resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1134                 if (MLX5_CAP_GEN(mdev, qp_packet_based))
1135                         resp.flags |=
1136                                 MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1137
1138                 resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
1139         }
1140
1141         if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
1142                 resp.response_length += sizeof(resp.sw_parsing_caps);
1143                 if (MLX5_CAP_ETH(mdev, swp)) {
1144                         resp.sw_parsing_caps.sw_parsing_offloads |=
1145                                 MLX5_IB_SW_PARSING;
1146
1147                         if (MLX5_CAP_ETH(mdev, swp_csum))
1148                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1149                                         MLX5_IB_SW_PARSING_CSUM;
1150
1151                         if (MLX5_CAP_ETH(mdev, swp_lso))
1152                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1153                                         MLX5_IB_SW_PARSING_LSO;
1154
1155                         if (resp.sw_parsing_caps.sw_parsing_offloads)
1156                                 resp.sw_parsing_caps.supported_qpts =
1157                                         BIT(IB_QPT_RAW_PACKET);
1158                 }
1159         }
1160
1161         if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
1162             raw_support) {
1163                 resp.response_length += sizeof(resp.striding_rq_caps);
1164                 if (MLX5_CAP_GEN(mdev, striding_rq)) {
1165                         resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1166                                 MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1167                         resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1168                                 MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1169                         if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
1170                                 resp.striding_rq_caps
1171                                         .min_single_wqe_log_num_of_strides =
1172                                         MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1173                         else
1174                                 resp.striding_rq_caps
1175                                         .min_single_wqe_log_num_of_strides =
1176                                         MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1177                         resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1178                                 MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1179                         resp.striding_rq_caps.supported_qpts =
1180                                 BIT(IB_QPT_RAW_PACKET);
1181                 }
1182         }
1183
1184         if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
1185                 resp.response_length += sizeof(resp.tunnel_offloads_caps);
1186                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1187                         resp.tunnel_offloads_caps |=
1188                                 MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1189                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1190                         resp.tunnel_offloads_caps |=
1191                                 MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1192                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1193                         resp.tunnel_offloads_caps |=
1194                                 MLX5_IB_TUNNELED_OFFLOADS_GRE;
1195                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1196                     MLX5_FLEX_PROTO_CW_MPLS_GRE)
1197                         resp.tunnel_offloads_caps |=
1198                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1199                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1200                     MLX5_FLEX_PROTO_CW_MPLS_UDP)
1201                         resp.tunnel_offloads_caps |=
1202                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1203         }
1204
1205         if (uhw_outlen) {
1206                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1207
1208                 if (err)
1209                         return err;
1210         }
1211
1212         return 0;
1213 }
1214
1215 enum mlx5_ib_width {
1216         MLX5_IB_WIDTH_1X        = 1 << 0,
1217         MLX5_IB_WIDTH_2X        = 1 << 1,
1218         MLX5_IB_WIDTH_4X        = 1 << 2,
1219         MLX5_IB_WIDTH_8X        = 1 << 3,
1220         MLX5_IB_WIDTH_12X       = 1 << 4
1221 };
1222
1223 static void translate_active_width(struct ib_device *ibdev, u8 active_width,
1224                                   u8 *ib_width)
1225 {
1226         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1227
1228         if (active_width & MLX5_IB_WIDTH_1X)
1229                 *ib_width = IB_WIDTH_1X;
1230         else if (active_width & MLX5_IB_WIDTH_2X)
1231                 *ib_width = IB_WIDTH_2X;
1232         else if (active_width & MLX5_IB_WIDTH_4X)
1233                 *ib_width = IB_WIDTH_4X;
1234         else if (active_width & MLX5_IB_WIDTH_8X)
1235                 *ib_width = IB_WIDTH_8X;
1236         else if (active_width & MLX5_IB_WIDTH_12X)
1237                 *ib_width = IB_WIDTH_12X;
1238         else {
1239                 mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1240                             (int)active_width);
1241                 *ib_width = IB_WIDTH_4X;
1242         }
1243
1244         return;
1245 }
1246
1247 static int mlx5_mtu_to_ib_mtu(int mtu)
1248 {
1249         switch (mtu) {
1250         case 256: return 1;
1251         case 512: return 2;
1252         case 1024: return 3;
1253         case 2048: return 4;
1254         case 4096: return 5;
1255         default:
1256                 pr_warn("invalid mtu\n");
1257                 return -1;
1258         }
1259 }
1260
1261 enum ib_max_vl_num {
1262         __IB_MAX_VL_0           = 1,
1263         __IB_MAX_VL_0_1         = 2,
1264         __IB_MAX_VL_0_3         = 3,
1265         __IB_MAX_VL_0_7         = 4,
1266         __IB_MAX_VL_0_14        = 5,
1267 };
1268
1269 enum mlx5_vl_hw_cap {
1270         MLX5_VL_HW_0    = 1,
1271         MLX5_VL_HW_0_1  = 2,
1272         MLX5_VL_HW_0_2  = 3,
1273         MLX5_VL_HW_0_3  = 4,
1274         MLX5_VL_HW_0_4  = 5,
1275         MLX5_VL_HW_0_5  = 6,
1276         MLX5_VL_HW_0_6  = 7,
1277         MLX5_VL_HW_0_7  = 8,
1278         MLX5_VL_HW_0_14 = 15
1279 };
1280
1281 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1282                                 u8 *max_vl_num)
1283 {
1284         switch (vl_hw_cap) {
1285         case MLX5_VL_HW_0:
1286                 *max_vl_num = __IB_MAX_VL_0;
1287                 break;
1288         case MLX5_VL_HW_0_1:
1289                 *max_vl_num = __IB_MAX_VL_0_1;
1290                 break;
1291         case MLX5_VL_HW_0_3:
1292                 *max_vl_num = __IB_MAX_VL_0_3;
1293                 break;
1294         case MLX5_VL_HW_0_7:
1295                 *max_vl_num = __IB_MAX_VL_0_7;
1296                 break;
1297         case MLX5_VL_HW_0_14:
1298                 *max_vl_num = __IB_MAX_VL_0_14;
1299                 break;
1300
1301         default:
1302                 return -EINVAL;
1303         }
1304
1305         return 0;
1306 }
1307
1308 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
1309                                struct ib_port_attr *props)
1310 {
1311         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1312         struct mlx5_core_dev *mdev = dev->mdev;
1313         struct mlx5_hca_vport_context *rep;
1314         u16 max_mtu;
1315         u16 oper_mtu;
1316         int err;
1317         u8 ib_link_width_oper;
1318         u8 vl_hw_cap;
1319
1320         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1321         if (!rep) {
1322                 err = -ENOMEM;
1323                 goto out;
1324         }
1325
1326         /* props being zeroed by the caller, avoid zeroing it here */
1327
1328         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1329         if (err)
1330                 goto out;
1331
1332         props->lid              = rep->lid;
1333         props->lmc              = rep->lmc;
1334         props->sm_lid           = rep->sm_lid;
1335         props->sm_sl            = rep->sm_sl;
1336         props->state            = rep->vport_state;
1337         props->phys_state       = rep->port_physical_state;
1338         props->port_cap_flags   = rep->cap_mask1;
1339         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1340         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1341         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1342         props->bad_pkey_cntr    = rep->pkey_violation_counter;
1343         props->qkey_viol_cntr   = rep->qkey_violation_counter;
1344         props->subnet_timeout   = rep->subnet_timeout;
1345         props->init_type_reply  = rep->init_type_reply;
1346
1347         if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1348                 props->port_cap_flags2 = rep->cap_mask2;
1349
1350         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
1351         if (err)
1352                 goto out;
1353
1354         translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1355
1356         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
1357         if (err)
1358                 goto out;
1359
1360         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1361
1362         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1363
1364         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1365
1366         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1367
1368         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1369         if (err)
1370                 goto out;
1371
1372         err = translate_max_vl_num(ibdev, vl_hw_cap,
1373                                    &props->max_vl_num);
1374 out:
1375         kfree(rep);
1376         return err;
1377 }
1378
1379 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
1380                        struct ib_port_attr *props)
1381 {
1382         unsigned int count;
1383         int ret;
1384
1385         switch (mlx5_get_vport_access_method(ibdev)) {
1386         case MLX5_VPORT_ACCESS_METHOD_MAD:
1387                 ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1388                 break;
1389
1390         case MLX5_VPORT_ACCESS_METHOD_HCA:
1391                 ret = mlx5_query_hca_port(ibdev, port, props);
1392                 break;
1393
1394         case MLX5_VPORT_ACCESS_METHOD_NIC:
1395                 ret = mlx5_query_port_roce(ibdev, port, props);
1396                 break;
1397
1398         default:
1399                 ret = -EINVAL;
1400         }
1401
1402         if (!ret && props) {
1403                 struct mlx5_ib_dev *dev = to_mdev(ibdev);
1404                 struct mlx5_core_dev *mdev;
1405                 bool put_mdev = true;
1406
1407                 mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1408                 if (!mdev) {
1409                         /* If the port isn't affiliated yet query the master.
1410                          * The master and slave will have the same values.
1411                          */
1412                         mdev = dev->mdev;
1413                         port = 1;
1414                         put_mdev = false;
1415                 }
1416                 count = mlx5_core_reserved_gids_count(mdev);
1417                 if (put_mdev)
1418                         mlx5_ib_put_native_port_mdev(dev, port);
1419                 props->gid_tbl_len -= count;
1420         }
1421         return ret;
1422 }
1423
1424 static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
1425                                   struct ib_port_attr *props)
1426 {
1427         int ret;
1428
1429         /* Only link layer == ethernet is valid for representors
1430          * and we always use port 1
1431          */
1432         ret = mlx5_query_port_roce(ibdev, port, props);
1433         if (ret || !props)
1434                 return ret;
1435
1436         /* We don't support GIDS */
1437         props->gid_tbl_len = 0;
1438
1439         return ret;
1440 }
1441
1442 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1443                              union ib_gid *gid)
1444 {
1445         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1446         struct mlx5_core_dev *mdev = dev->mdev;
1447
1448         switch (mlx5_get_vport_access_method(ibdev)) {
1449         case MLX5_VPORT_ACCESS_METHOD_MAD:
1450                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1451
1452         case MLX5_VPORT_ACCESS_METHOD_HCA:
1453                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1454
1455         default:
1456                 return -EINVAL;
1457         }
1458
1459 }
1460
1461 static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
1462                                    u16 index, u16 *pkey)
1463 {
1464         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1465         struct mlx5_core_dev *mdev;
1466         bool put_mdev = true;
1467         u8 mdev_port_num;
1468         int err;
1469
1470         mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1471         if (!mdev) {
1472                 /* The port isn't affiliated yet, get the PKey from the master
1473                  * port. For RoCE the PKey tables will be the same.
1474                  */
1475                 put_mdev = false;
1476                 mdev = dev->mdev;
1477                 mdev_port_num = 1;
1478         }
1479
1480         err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1481                                         index, pkey);
1482         if (put_mdev)
1483                 mlx5_ib_put_native_port_mdev(dev, port);
1484
1485         return err;
1486 }
1487
1488 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1489                               u16 *pkey)
1490 {
1491         switch (mlx5_get_vport_access_method(ibdev)) {
1492         case MLX5_VPORT_ACCESS_METHOD_MAD:
1493                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1494
1495         case MLX5_VPORT_ACCESS_METHOD_HCA:
1496         case MLX5_VPORT_ACCESS_METHOD_NIC:
1497                 return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1498         default:
1499                 return -EINVAL;
1500         }
1501 }
1502
1503 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1504                                  struct ib_device_modify *props)
1505 {
1506         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1507         struct mlx5_reg_node_desc in;
1508         struct mlx5_reg_node_desc out;
1509         int err;
1510
1511         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1512                 return -EOPNOTSUPP;
1513
1514         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1515                 return 0;
1516
1517         /*
1518          * If possible, pass node desc to FW, so it can generate
1519          * a 144 trap.  If cmd fails, just ignore.
1520          */
1521         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1522         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1523                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1524         if (err)
1525                 return err;
1526
1527         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1528
1529         return err;
1530 }
1531
1532 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1533                                 u32 value)
1534 {
1535         struct mlx5_hca_vport_context ctx = {};
1536         struct mlx5_core_dev *mdev;
1537         u8 mdev_port_num;
1538         int err;
1539
1540         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1541         if (!mdev)
1542                 return -ENODEV;
1543
1544         err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1545         if (err)
1546                 goto out;
1547
1548         if (~ctx.cap_mask1_perm & mask) {
1549                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1550                              mask, ctx.cap_mask1_perm);
1551                 err = -EINVAL;
1552                 goto out;
1553         }
1554
1555         ctx.cap_mask1 = value;
1556         ctx.cap_mask1_perm = mask;
1557         err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1558                                                  0, &ctx);
1559
1560 out:
1561         mlx5_ib_put_native_port_mdev(dev, port_num);
1562
1563         return err;
1564 }
1565
1566 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1567                                struct ib_port_modify *props)
1568 {
1569         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1570         struct ib_port_attr attr;
1571         u32 tmp;
1572         int err;
1573         u32 change_mask;
1574         u32 value;
1575         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1576                       IB_LINK_LAYER_INFINIBAND);
1577
1578         /* CM layer calls ib_modify_port() regardless of the link layer. For
1579          * Ethernet ports, qkey violation and Port capabilities are meaningless.
1580          */
1581         if (!is_ib)
1582                 return 0;
1583
1584         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1585                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1586                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1587                 return set_port_caps_atomic(dev, port, change_mask, value);
1588         }
1589
1590         mutex_lock(&dev->cap_mask_mutex);
1591
1592         err = ib_query_port(ibdev, port, &attr);
1593         if (err)
1594                 goto out;
1595
1596         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1597                 ~props->clr_port_cap_mask;
1598
1599         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1600
1601 out:
1602         mutex_unlock(&dev->cap_mask_mutex);
1603         return err;
1604 }
1605
1606 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1607 {
1608         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1609                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1610 }
1611
1612 static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1613 {
1614         /* Large page with non 4k uar support might limit the dynamic size */
1615         if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1616                 return MLX5_MIN_DYN_BFREGS;
1617
1618         return MLX5_MAX_DYN_BFREGS;
1619 }
1620
1621 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1622                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1623                              struct mlx5_bfreg_info *bfregi)
1624 {
1625         int uars_per_sys_page;
1626         int bfregs_per_sys_page;
1627         int ref_bfregs = req->total_num_bfregs;
1628
1629         if (req->total_num_bfregs == 0)
1630                 return -EINVAL;
1631
1632         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1633         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1634
1635         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1636                 return -ENOMEM;
1637
1638         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1639         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1640         /* This holds the required static allocation asked by the user */
1641         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1642         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1643                 return -EINVAL;
1644
1645         bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1646         bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1647         bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1648         bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1649
1650         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1651                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1652                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1653                     req->total_num_bfregs, bfregi->total_num_bfregs,
1654                     bfregi->num_sys_pages);
1655
1656         return 0;
1657 }
1658
1659 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1660 {
1661         struct mlx5_bfreg_info *bfregi;
1662         int err;
1663         int i;
1664
1665         bfregi = &context->bfregi;
1666         for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1667                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1668                 if (err)
1669                         goto error;
1670
1671                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1672         }
1673
1674         for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1675                 bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1676
1677         return 0;
1678
1679 error:
1680         for (--i; i >= 0; i--)
1681                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1682                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1683
1684         return err;
1685 }
1686
1687 static void deallocate_uars(struct mlx5_ib_dev *dev,
1688                             struct mlx5_ib_ucontext *context)
1689 {
1690         struct mlx5_bfreg_info *bfregi;
1691         int i;
1692
1693         bfregi = &context->bfregi;
1694         for (i = 0; i < bfregi->num_sys_pages; i++)
1695                 if (i < bfregi->num_static_sys_pages ||
1696                     bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1697                         mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1698 }
1699
1700 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1701 {
1702         int err = 0;
1703
1704         mutex_lock(&dev->lb.mutex);
1705         if (td)
1706                 dev->lb.user_td++;
1707         if (qp)
1708                 dev->lb.qps++;
1709
1710         if (dev->lb.user_td == 2 ||
1711             dev->lb.qps == 1) {
1712                 if (!dev->lb.enabled) {
1713                         err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1714                         dev->lb.enabled = true;
1715                 }
1716         }
1717
1718         mutex_unlock(&dev->lb.mutex);
1719
1720         return err;
1721 }
1722
1723 void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1724 {
1725         mutex_lock(&dev->lb.mutex);
1726         if (td)
1727                 dev->lb.user_td--;
1728         if (qp)
1729                 dev->lb.qps--;
1730
1731         if (dev->lb.user_td == 1 &&
1732             dev->lb.qps == 0) {
1733                 if (dev->lb.enabled) {
1734                         mlx5_nic_vport_update_local_lb(dev->mdev, false);
1735                         dev->lb.enabled = false;
1736                 }
1737         }
1738
1739         mutex_unlock(&dev->lb.mutex);
1740 }
1741
1742 static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1743                                           u16 uid)
1744 {
1745         int err;
1746
1747         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1748                 return 0;
1749
1750         err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1751         if (err)
1752                 return err;
1753
1754         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1755             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1756              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1757                 return err;
1758
1759         return mlx5_ib_enable_lb(dev, true, false);
1760 }
1761
1762 static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1763                                              u16 uid)
1764 {
1765         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1766                 return;
1767
1768         mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1769
1770         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1771             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1772              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1773                 return;
1774
1775         mlx5_ib_disable_lb(dev, true, false);
1776 }
1777
1778 static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1779                                   struct ib_udata *udata)
1780 {
1781         struct ib_device *ibdev = uctx->device;
1782         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1783         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1784         struct mlx5_ib_alloc_ucontext_resp resp = {};
1785         struct mlx5_core_dev *mdev = dev->mdev;
1786         struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1787         struct mlx5_bfreg_info *bfregi;
1788         int ver;
1789         int err;
1790         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1791                                      max_cqe_version);
1792         u32 dump_fill_mkey;
1793         bool lib_uar_4k;
1794
1795         if (!dev->ib_active)
1796                 return -EAGAIN;
1797
1798         if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1799                 ver = 0;
1800         else if (udata->inlen >= min_req_v2)
1801                 ver = 2;
1802         else
1803                 return -EINVAL;
1804
1805         err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1806         if (err)
1807                 return err;
1808
1809         if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1810                 return -EOPNOTSUPP;
1811
1812         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1813                 return -EOPNOTSUPP;
1814
1815         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1816                                     MLX5_NON_FP_BFREGS_PER_UAR);
1817         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1818                 return -EINVAL;
1819
1820         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1821         if (dev->wc_support)
1822                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1823         resp.cache_line_size = cache_line_size();
1824         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1825         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1826         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1827         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1828         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1829         resp.cqe_version = min_t(__u8,
1830                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1831                                  req.max_cqe_version);
1832         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1833                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1834         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1835                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1836         resp.response_length = min(offsetof(typeof(resp), response_length) +
1837                                    sizeof(resp.response_length), udata->outlen);
1838
1839         if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
1840                 if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
1841                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
1842                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
1843                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
1844                 if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
1845                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
1846                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
1847                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
1848                 /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
1849         }
1850
1851         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1852         bfregi = &context->bfregi;
1853
1854         /* updates req->total_num_bfregs */
1855         err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1856         if (err)
1857                 goto out_ctx;
1858
1859         mutex_init(&bfregi->lock);
1860         bfregi->lib_uar_4k = lib_uar_4k;
1861         bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1862                                 GFP_KERNEL);
1863         if (!bfregi->count) {
1864                 err = -ENOMEM;
1865                 goto out_ctx;
1866         }
1867
1868         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1869                                     sizeof(*bfregi->sys_pages),
1870                                     GFP_KERNEL);
1871         if (!bfregi->sys_pages) {
1872                 err = -ENOMEM;
1873                 goto out_count;
1874         }
1875
1876         err = allocate_uars(dev, context);
1877         if (err)
1878                 goto out_sys_pages;
1879
1880         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1881                 err = mlx5_ib_devx_create(dev, true);
1882                 if (err < 0)
1883                         goto out_uars;
1884                 context->devx_uid = err;
1885         }
1886
1887         err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1888                                              context->devx_uid);
1889         if (err)
1890                 goto out_devx;
1891
1892         if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1893                 err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
1894                 if (err)
1895                         goto out_mdev;
1896         }
1897
1898         INIT_LIST_HEAD(&context->db_page_list);
1899         mutex_init(&context->db_page_mutex);
1900
1901         resp.tot_bfregs = req.total_num_bfregs;
1902         resp.num_ports = dev->num_ports;
1903
1904         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1905                 resp.response_length += sizeof(resp.cqe_version);
1906
1907         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1908                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1909                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1910                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1911         }
1912
1913         if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
1914                 if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1915                         mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
1916                         resp.eth_min_inline++;
1917                 }
1918                 resp.response_length += sizeof(resp.eth_min_inline);
1919         }
1920
1921         if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
1922                 if (mdev->clock_info)
1923                         resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1924                 resp.response_length += sizeof(resp.clock_info_versions);
1925         }
1926
1927         /*
1928          * We don't want to expose information from the PCI bar that is located
1929          * after 4096 bytes, so if the arch only supports larger pages, let's
1930          * pretend we don't support reading the HCA's core clock. This is also
1931          * forced by mmap function.
1932          */
1933         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1934                 if (PAGE_SIZE <= 4096) {
1935                         resp.comp_mask |=
1936                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1937                         resp.hca_core_clock_offset =
1938                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1939                 }
1940                 resp.response_length += sizeof(resp.hca_core_clock_offset);
1941         }
1942
1943         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1944                 resp.response_length += sizeof(resp.log_uar_size);
1945
1946         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1947                 resp.response_length += sizeof(resp.num_uars_per_page);
1948
1949         if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
1950                 resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
1951                 resp.response_length += sizeof(resp.num_dyn_bfregs);
1952         }
1953
1954         if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
1955                 if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1956                         resp.dump_fill_mkey = dump_fill_mkey;
1957                         resp.comp_mask |=
1958                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1959                 }
1960                 resp.response_length += sizeof(resp.dump_fill_mkey);
1961         }
1962
1963         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1964         if (err)
1965                 goto out_mdev;
1966
1967         bfregi->ver = ver;
1968         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1969         context->cqe_version = resp.cqe_version;
1970         context->lib_caps = req.lib_caps;
1971         print_lib_caps(dev, context->lib_caps);
1972
1973         if (dev->lag_active) {
1974                 u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
1975
1976                 atomic_set(&context->tx_port_affinity,
1977                            atomic_add_return(
1978                                    1, &dev->port[port].roce.tx_port_affinity));
1979         }
1980
1981         return 0;
1982
1983 out_mdev:
1984         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1985 out_devx:
1986         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1987                 mlx5_ib_devx_destroy(dev, context->devx_uid);
1988
1989 out_uars:
1990         deallocate_uars(dev, context);
1991
1992 out_sys_pages:
1993         kfree(bfregi->sys_pages);
1994
1995 out_count:
1996         kfree(bfregi->count);
1997
1998 out_ctx:
1999         return err;
2000 }
2001
2002 static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
2003 {
2004         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2005         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2006         struct mlx5_bfreg_info *bfregi;
2007
2008         bfregi = &context->bfregi;
2009         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2010
2011         if (context->devx_uid)
2012                 mlx5_ib_devx_destroy(dev, context->devx_uid);
2013
2014         deallocate_uars(dev, context);
2015         kfree(bfregi->sys_pages);
2016         kfree(bfregi->count);
2017 }
2018
2019 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
2020                                  int uar_idx)
2021 {
2022         int fw_uars_per_page;
2023
2024         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
2025
2026         return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
2027 }
2028
2029 static int get_command(unsigned long offset)
2030 {
2031         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
2032 }
2033
2034 static int get_arg(unsigned long offset)
2035 {
2036         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
2037 }
2038
2039 static int get_index(unsigned long offset)
2040 {
2041         return get_arg(offset);
2042 }
2043
2044 /* Index resides in an extra byte to enable larger values than 255 */
2045 static int get_extended_index(unsigned long offset)
2046 {
2047         return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
2048 }
2049
2050
2051 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
2052 {
2053 }
2054
2055 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
2056 {
2057         switch (cmd) {
2058         case MLX5_IB_MMAP_WC_PAGE:
2059                 return "WC";
2060         case MLX5_IB_MMAP_REGULAR_PAGE:
2061                 return "best effort WC";
2062         case MLX5_IB_MMAP_NC_PAGE:
2063                 return "NC";
2064         case MLX5_IB_MMAP_DEVICE_MEM:
2065                 return "Device Memory";
2066         default:
2067                 return NULL;
2068         }
2069 }
2070
2071 static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
2072                                         struct vm_area_struct *vma,
2073                                         struct mlx5_ib_ucontext *context)
2074 {
2075         if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
2076             !(vma->vm_flags & VM_SHARED))
2077                 return -EINVAL;
2078
2079         if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
2080                 return -EOPNOTSUPP;
2081
2082         if (vma->vm_flags & (VM_WRITE | VM_EXEC))
2083                 return -EPERM;
2084         vma->vm_flags &= ~VM_MAYWRITE;
2085
2086         if (!dev->mdev->clock_info)
2087                 return -EOPNOTSUPP;
2088
2089         return vm_insert_page(vma, vma->vm_start,
2090                               virt_to_page(dev->mdev->clock_info));
2091 }
2092
2093 static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
2094 {
2095         struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
2096         struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
2097         struct mlx5_var_table *var_table = &dev->var_table;
2098         struct mlx5_ib_dm *mdm;
2099
2100         switch (mentry->mmap_flag) {
2101         case MLX5_IB_MMAP_TYPE_MEMIC:
2102                 mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
2103                 mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
2104                                        mdm->size);
2105                 kfree(mdm);
2106                 break;
2107         case MLX5_IB_MMAP_TYPE_VAR:
2108                 mutex_lock(&var_table->bitmap_lock);
2109                 clear_bit(mentry->page_idx, var_table->bitmap);
2110                 mutex_unlock(&var_table->bitmap_lock);
2111                 kfree(mentry);
2112                 break;
2113         default:
2114                 WARN_ON(true);
2115         }
2116 }
2117
2118 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
2119                     struct vm_area_struct *vma,
2120                     struct mlx5_ib_ucontext *context)
2121 {
2122         struct mlx5_bfreg_info *bfregi = &context->bfregi;
2123         int err;
2124         unsigned long idx;
2125         phys_addr_t pfn;
2126         pgprot_t prot;
2127         u32 bfreg_dyn_idx = 0;
2128         u32 uar_index;
2129         int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2130         int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2131                                 bfregi->num_static_sys_pages;
2132
2133         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2134                 return -EINVAL;
2135
2136         if (dyn_uar)
2137                 idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2138         else
2139                 idx = get_index(vma->vm_pgoff);
2140
2141         if (idx >= max_valid_idx) {
2142                 mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2143                              idx, max_valid_idx);
2144                 return -EINVAL;
2145         }
2146
2147         switch (cmd) {
2148         case MLX5_IB_MMAP_WC_PAGE:
2149         case MLX5_IB_MMAP_ALLOC_WC:
2150 /* Some architectures don't support WC memory */
2151 #if defined(CONFIG_X86)
2152                 if (!pat_enabled())
2153                         return -EPERM;
2154 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
2155                         return -EPERM;
2156 #endif
2157         /* fall through */
2158         case MLX5_IB_MMAP_REGULAR_PAGE:
2159                 /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
2160                 prot = pgprot_writecombine(vma->vm_page_prot);
2161                 break;
2162         case MLX5_IB_MMAP_NC_PAGE:
2163                 prot = pgprot_noncached(vma->vm_page_prot);
2164                 break;
2165         default:
2166                 return -EINVAL;
2167         }
2168
2169         if (dyn_uar) {
2170                 int uars_per_page;
2171
2172                 uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2173                 bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2174                 if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2175                         mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2176                                      bfreg_dyn_idx, bfregi->total_num_bfregs);
2177                         return -EINVAL;
2178                 }
2179
2180                 mutex_lock(&bfregi->lock);
2181                 /* Fail if uar already allocated, first bfreg index of each
2182                  * page holds its count.
2183                  */
2184                 if (bfregi->count[bfreg_dyn_idx]) {
2185                         mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2186                         mutex_unlock(&bfregi->lock);
2187                         return -EINVAL;
2188                 }
2189
2190                 bfregi->count[bfreg_dyn_idx]++;
2191                 mutex_unlock(&bfregi->lock);
2192
2193                 err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
2194                 if (err) {
2195                         mlx5_ib_warn(dev, "UAR alloc failed\n");
2196                         goto free_bfreg;
2197                 }
2198         } else {
2199                 uar_index = bfregi->sys_pages[idx];
2200         }
2201
2202         pfn = uar_index2pfn(dev, uar_index);
2203         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2204
2205         err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2206                                 prot, NULL);
2207         if (err) {
2208                 mlx5_ib_err(dev,
2209                             "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2210                             err, mmap_cmd2str(cmd));
2211                 goto err;
2212         }
2213
2214         if (dyn_uar)
2215                 bfregi->sys_pages[idx] = uar_index;
2216         return 0;
2217
2218 err:
2219         if (!dyn_uar)
2220                 return err;
2221
2222         mlx5_cmd_free_uar(dev->mdev, idx);
2223
2224 free_bfreg:
2225         mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2226
2227         return err;
2228 }
2229
2230 static int add_dm_mmap_entry(struct ib_ucontext *context,
2231                              struct mlx5_ib_dm *mdm,
2232                              u64 address)
2233 {
2234         mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
2235         mdm->mentry.address = address;
2236         return rdma_user_mmap_entry_insert_range(
2237                         context, &mdm->mentry.rdma_entry,
2238                         mdm->size,
2239                         MLX5_IB_MMAP_DEVICE_MEM << 16,
2240                         (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
2241 }
2242
2243 static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
2244 {
2245         unsigned long idx;
2246         u8 command;
2247
2248         command = get_command(vma->vm_pgoff);
2249         idx = get_extended_index(vma->vm_pgoff);
2250
2251         return (command << 16 | idx);
2252 }
2253
2254 static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
2255                                struct vm_area_struct *vma,
2256                                struct ib_ucontext *ucontext)
2257 {
2258         struct mlx5_user_mmap_entry *mentry;
2259         struct rdma_user_mmap_entry *entry;
2260         unsigned long pgoff;
2261         pgprot_t prot;
2262         phys_addr_t pfn;
2263         int ret;
2264
2265         pgoff = mlx5_vma_to_pgoff(vma);
2266         entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
2267         if (!entry)
2268                 return -EINVAL;
2269
2270         mentry = to_mmmap(entry);
2271         pfn = (mentry->address >> PAGE_SHIFT);
2272         if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR)
2273                 prot = pgprot_noncached(vma->vm_page_prot);
2274         else
2275                 prot = pgprot_writecombine(vma->vm_page_prot);
2276         ret = rdma_user_mmap_io(ucontext, vma, pfn,
2277                                 entry->npages * PAGE_SIZE,
2278                                 prot,
2279                                 entry);
2280         rdma_user_mmap_entry_put(&mentry->rdma_entry);
2281         return ret;
2282 }
2283
2284 static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
2285 {
2286         u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
2287         u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
2288
2289         return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
2290                 (index & 0xFF)) << PAGE_SHIFT;
2291 }
2292
2293 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2294 {
2295         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2296         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2297         unsigned long command;
2298         phys_addr_t pfn;
2299
2300         command = get_command(vma->vm_pgoff);
2301         switch (command) {
2302         case MLX5_IB_MMAP_WC_PAGE:
2303         case MLX5_IB_MMAP_NC_PAGE:
2304         case MLX5_IB_MMAP_REGULAR_PAGE:
2305         case MLX5_IB_MMAP_ALLOC_WC:
2306                 return uar_mmap(dev, command, vma, context);
2307
2308         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2309                 return -ENOSYS;
2310
2311         case MLX5_IB_MMAP_CORE_CLOCK:
2312                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2313                         return -EINVAL;
2314
2315                 if (vma->vm_flags & VM_WRITE)
2316                         return -EPERM;
2317                 vma->vm_flags &= ~VM_MAYWRITE;
2318
2319                 /* Don't expose to user-space information it shouldn't have */
2320                 if (PAGE_SIZE > 4096)
2321                         return -EOPNOTSUPP;
2322
2323                 pfn = (dev->mdev->iseg_base +
2324                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2325                         PAGE_SHIFT;
2326                 return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
2327                                          PAGE_SIZE,
2328                                          pgprot_noncached(vma->vm_page_prot),
2329                                          NULL);
2330         case MLX5_IB_MMAP_CLOCK_INFO:
2331                 return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2332
2333         default:
2334                 return mlx5_ib_mmap_offset(dev, vma, ibcontext);
2335         }
2336
2337         return 0;
2338 }
2339
2340 static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
2341                                         u32 type)
2342 {
2343         switch (type) {
2344         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2345                 if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
2346                         return -EOPNOTSUPP;
2347                 break;
2348         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2349         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2350                 if (!capable(CAP_SYS_RAWIO) ||
2351                     !capable(CAP_NET_RAW))
2352                         return -EPERM;
2353
2354                 if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
2355                       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
2356                         return -EOPNOTSUPP;
2357                 break;
2358         }
2359
2360         return 0;
2361 }
2362
2363 static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
2364                                  struct mlx5_ib_dm *dm,
2365                                  struct ib_dm_alloc_attr *attr,
2366                                  struct uverbs_attr_bundle *attrs)
2367 {
2368         struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
2369         u64 start_offset;
2370         u16 page_idx;
2371         int err;
2372         u64 address;
2373
2374         dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
2375
2376         err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
2377                                    dm->size, attr->alignment);
2378         if (err)
2379                 return err;
2380
2381         address = dm->dev_addr & PAGE_MASK;
2382         err = add_dm_mmap_entry(ctx, dm, address);
2383         if (err)
2384                 goto err_dealloc;
2385
2386         page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
2387         err = uverbs_copy_to(attrs,
2388                              MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
2389                              &page_idx,
2390                              sizeof(page_idx));
2391         if (err)
2392                 goto err_copy;
2393
2394         start_offset = dm->dev_addr & ~PAGE_MASK;
2395         err = uverbs_copy_to(attrs,
2396                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2397                              &start_offset, sizeof(start_offset));
2398         if (err)
2399                 goto err_copy;
2400
2401         return 0;
2402
2403 err_copy:
2404         rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2405 err_dealloc:
2406         mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
2407
2408         return err;
2409 }
2410
2411 static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
2412                                   struct mlx5_ib_dm *dm,
2413                                   struct ib_dm_alloc_attr *attr,
2414                                   struct uverbs_attr_bundle *attrs,
2415                                   int type)
2416 {
2417         struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
2418         u64 act_size;
2419         int err;
2420
2421         /* Allocation size must a multiple of the basic block size
2422          * and a power of 2.
2423          */
2424         act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
2425         act_size = roundup_pow_of_two(act_size);
2426
2427         dm->size = act_size;
2428         err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
2429                                    to_mucontext(ctx)->devx_uid, &dm->dev_addr,
2430                                    &dm->icm_dm.obj_id);
2431         if (err)
2432                 return err;
2433
2434         err = uverbs_copy_to(attrs,
2435                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2436                              &dm->dev_addr, sizeof(dm->dev_addr));
2437         if (err)
2438                 mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
2439                                        to_mucontext(ctx)->devx_uid, dm->dev_addr,
2440                                        dm->icm_dm.obj_id);
2441
2442         return err;
2443 }
2444
2445 struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
2446                                struct ib_ucontext *context,
2447                                struct ib_dm_alloc_attr *attr,
2448                                struct uverbs_attr_bundle *attrs)
2449 {
2450         struct mlx5_ib_dm *dm;
2451         enum mlx5_ib_uapi_dm_type type;
2452         int err;
2453
2454         err = uverbs_get_const_default(&type, attrs,
2455                                        MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
2456                                        MLX5_IB_UAPI_DM_TYPE_MEMIC);
2457         if (err)
2458                 return ERR_PTR(err);
2459
2460         mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
2461                     type, attr->length, attr->alignment);
2462
2463         err = check_dm_type_support(to_mdev(ibdev), type);
2464         if (err)
2465                 return ERR_PTR(err);
2466
2467         dm = kzalloc(sizeof(*dm), GFP_KERNEL);
2468         if (!dm)
2469                 return ERR_PTR(-ENOMEM);
2470
2471         dm->type = type;
2472
2473         switch (type) {
2474         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2475                 err = handle_alloc_dm_memic(context, dm,
2476                                             attr,
2477                                             attrs);
2478                 break;
2479         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2480                 err = handle_alloc_dm_sw_icm(context, dm,
2481                                              attr, attrs,
2482                                              MLX5_SW_ICM_TYPE_STEERING);
2483                 break;
2484         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2485                 err = handle_alloc_dm_sw_icm(context, dm,
2486                                              attr, attrs,
2487                                              MLX5_SW_ICM_TYPE_HEADER_MODIFY);
2488                 break;
2489         default:
2490                 err = -EOPNOTSUPP;
2491         }
2492
2493         if (err)
2494                 goto err_free;
2495
2496         return &dm->ibdm;
2497
2498 err_free:
2499         kfree(dm);
2500         return ERR_PTR(err);
2501 }
2502
2503 int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
2504 {
2505         struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
2506                 &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
2507         struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
2508         struct mlx5_ib_dm *dm = to_mdm(ibdm);
2509         int ret;
2510
2511         switch (dm->type) {
2512         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2513                 rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2514                 return 0;
2515         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2516                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
2517                                              dm->size, ctx->devx_uid, dm->dev_addr,
2518                                              dm->icm_dm.obj_id);
2519                 if (ret)
2520                         return ret;
2521                 break;
2522         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2523                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
2524                                              dm->size, ctx->devx_uid, dm->dev_addr,
2525                                              dm->icm_dm.obj_id);
2526                 if (ret)
2527                         return ret;
2528                 break;
2529         default:
2530                 return -EOPNOTSUPP;
2531         }
2532
2533         kfree(dm);
2534
2535         return 0;
2536 }
2537
2538 static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
2539 {
2540         struct mlx5_ib_pd *pd = to_mpd(ibpd);
2541         struct ib_device *ibdev = ibpd->device;
2542         struct mlx5_ib_alloc_pd_resp resp;
2543         int err;
2544         u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2545         u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
2546         u16 uid = 0;
2547         struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
2548                 udata, struct mlx5_ib_ucontext, ibucontext);
2549
2550         uid = context ? context->devx_uid : 0;
2551         MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2552         MLX5_SET(alloc_pd_in, in, uid, uid);
2553         err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
2554                             out, sizeof(out));
2555         if (err)
2556                 return err;
2557
2558         pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2559         pd->uid = uid;
2560         if (udata) {
2561                 resp.pdn = pd->pdn;
2562                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2563                         mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2564                         return -EFAULT;
2565                 }
2566         }
2567
2568         return 0;
2569 }
2570
2571 static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
2572 {
2573         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2574         struct mlx5_ib_pd *mpd = to_mpd(pd);
2575
2576         mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2577 }
2578
2579 enum {
2580         MATCH_CRITERIA_ENABLE_OUTER_BIT,
2581         MATCH_CRITERIA_ENABLE_MISC_BIT,
2582         MATCH_CRITERIA_ENABLE_INNER_BIT,
2583         MATCH_CRITERIA_ENABLE_MISC2_BIT
2584 };
2585
2586 #define HEADER_IS_ZERO(match_criteria, headers)                            \
2587         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
2588                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
2589
2590 static u8 get_match_criteria_enable(u32 *match_criteria)
2591 {
2592         u8 match_criteria_enable;
2593
2594         match_criteria_enable =
2595                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
2596                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
2597         match_criteria_enable |=
2598                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
2599                 MATCH_CRITERIA_ENABLE_MISC_BIT;
2600         match_criteria_enable |=
2601                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
2602                 MATCH_CRITERIA_ENABLE_INNER_BIT;
2603         match_criteria_enable |=
2604                 (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
2605                 MATCH_CRITERIA_ENABLE_MISC2_BIT;
2606
2607         return match_criteria_enable;
2608 }
2609
2610 static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
2611 {
2612         u8 entry_mask;
2613         u8 entry_val;
2614         int err = 0;
2615
2616         if (!mask)
2617                 goto out;
2618
2619         entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
2620                               ip_protocol);
2621         entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
2622                              ip_protocol);
2623         if (!entry_mask) {
2624                 MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
2625                 MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
2626                 goto out;
2627         }
2628         /* Don't override existing ip protocol */
2629         if (mask != entry_mask || val != entry_val)
2630                 err = -EINVAL;
2631 out:
2632         return err;
2633 }
2634
2635 static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
2636                            bool inner)
2637 {
2638         if (inner) {
2639                 MLX5_SET(fte_match_set_misc,
2640                          misc_c, inner_ipv6_flow_label, mask);
2641                 MLX5_SET(fte_match_set_misc,
2642                          misc_v, inner_ipv6_flow_label, val);
2643         } else {
2644                 MLX5_SET(fte_match_set_misc,
2645                          misc_c, outer_ipv6_flow_label, mask);
2646                 MLX5_SET(fte_match_set_misc,
2647                          misc_v, outer_ipv6_flow_label, val);
2648         }
2649 }
2650
2651 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
2652 {
2653         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
2654         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
2655         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
2656         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
2657 }
2658
2659 static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
2660 {
2661         if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
2662             !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
2663                 return -EOPNOTSUPP;
2664
2665         if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
2666             !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
2667                 return -EOPNOTSUPP;
2668
2669         if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
2670             !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
2671                 return -EOPNOTSUPP;
2672
2673         if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
2674             !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
2675                 return -EOPNOTSUPP;
2676
2677         return 0;
2678 }
2679
2680 #define LAST_ETH_FIELD vlan_tag
2681 #define LAST_IB_FIELD sl
2682 #define LAST_IPV4_FIELD tos
2683 #define LAST_IPV6_FIELD traffic_class
2684 #define LAST_TCP_UDP_FIELD src_port
2685 #define LAST_TUNNEL_FIELD tunnel_id
2686 #define LAST_FLOW_TAG_FIELD tag_id
2687 #define LAST_DROP_FIELD size
2688 #define LAST_COUNTERS_FIELD counters
2689
2690 /* Field is the last supported field */
2691 #define FIELDS_NOT_SUPPORTED(filter, field)\
2692         memchr_inv((void *)&filter.field  +\
2693                    sizeof(filter.field), 0,\
2694                    sizeof(filter) -\
2695                    offsetof(typeof(filter), field) -\
2696                    sizeof(filter.field))
2697
2698 int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
2699                            bool is_egress,
2700                            struct mlx5_flow_act *action)
2701 {
2702
2703         switch (maction->ib_action.type) {
2704         case IB_FLOW_ACTION_ESP:
2705                 if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
2706                                       MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
2707                         return -EINVAL;
2708                 /* Currently only AES_GCM keymat is supported by the driver */
2709                 action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
2710                 action->action |= is_egress ?
2711                         MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
2712                         MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
2713                 return 0;
2714         case IB_FLOW_ACTION_UNSPECIFIED:
2715                 if (maction->flow_action_raw.sub_type ==
2716                     MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
2717                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
2718                                 return -EINVAL;
2719                         action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
2720                         action->modify_hdr =
2721                                 maction->flow_action_raw.modify_hdr;
2722                         return 0;
2723                 }
2724                 if (maction->flow_action_raw.sub_type ==
2725                     MLX5_IB_FLOW_ACTION_DECAP) {
2726                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
2727                                 return -EINVAL;
2728                         action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
2729                         return 0;
2730                 }
2731                 if (maction->flow_action_raw.sub_type ==
2732                     MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
2733                         if (action->action &
2734                             MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
2735                                 return -EINVAL;
2736                         action->action |=
2737                                 MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
2738                         action->pkt_reformat =
2739                                 maction->flow_action_raw.pkt_reformat;
2740                         return 0;
2741                 }
2742                 /* fall through */
2743         default:
2744                 return -EOPNOTSUPP;
2745         }
2746 }
2747
2748 static int parse_flow_attr(struct mlx5_core_dev *mdev,
2749                            struct mlx5_flow_spec *spec,
2750                            const union ib_flow_spec *ib_spec,
2751                            const struct ib_flow_attr *flow_attr,
2752                            struct mlx5_flow_act *action, u32 prev_type)
2753 {
2754         struct mlx5_flow_context *flow_context = &spec->flow_context;
2755         u32 *match_c = spec->match_criteria;
2756         u32 *match_v = spec->match_value;
2757         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
2758                                            misc_parameters);
2759         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
2760                                            misc_parameters);
2761         void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
2762                                             misc_parameters_2);
2763         void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
2764                                             misc_parameters_2);
2765         void *headers_c;
2766         void *headers_v;
2767         int match_ipv;
2768         int ret;
2769
2770         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2771                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2772                                          inner_headers);
2773                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2774                                          inner_headers);
2775                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2776                                         ft_field_support.inner_ip_version);
2777         } else {
2778                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2779                                          outer_headers);
2780                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2781                                          outer_headers);
2782                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2783                                         ft_field_support.outer_ip_version);
2784         }
2785
2786         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
2787         case IB_FLOW_SPEC_ETH:
2788                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
2789                         return -EOPNOTSUPP;
2790
2791                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2792                                              dmac_47_16),
2793                                 ib_spec->eth.mask.dst_mac);
2794                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2795                                              dmac_47_16),
2796                                 ib_spec->eth.val.dst_mac);
2797
2798                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2799                                              smac_47_16),
2800                                 ib_spec->eth.mask.src_mac);
2801                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2802                                              smac_47_16),
2803                                 ib_spec->eth.val.src_mac);
2804
2805                 if (ib_spec->eth.mask.vlan_tag) {
2806                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2807                                  cvlan_tag, 1);
2808                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2809                                  cvlan_tag, 1);
2810
2811                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2812                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
2813                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2814                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
2815
2816                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2817                                  first_cfi,
2818                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
2819                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2820                                  first_cfi,
2821                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
2822
2823                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2824                                  first_prio,
2825                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
2826                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2827                                  first_prio,
2828                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
2829                 }
2830                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2831                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
2832                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2833                          ethertype, ntohs(ib_spec->eth.val.ether_type));
2834                 break;
2835         case IB_FLOW_SPEC_IPV4:
2836                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
2837                         return -EOPNOTSUPP;
2838
2839                 if (match_ipv) {
2840                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2841                                  ip_version, 0xf);
2842                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2843                                  ip_version, MLX5_FS_IPV4_VERSION);
2844                 } else {
2845                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2846                                  ethertype, 0xffff);
2847                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2848                                  ethertype, ETH_P_IP);
2849                 }
2850
2851                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2852                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2853                        &ib_spec->ipv4.mask.src_ip,
2854                        sizeof(ib_spec->ipv4.mask.src_ip));
2855                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2856                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2857                        &ib_spec->ipv4.val.src_ip,
2858                        sizeof(ib_spec->ipv4.val.src_ip));
2859                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2860                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2861                        &ib_spec->ipv4.mask.dst_ip,
2862                        sizeof(ib_spec->ipv4.mask.dst_ip));
2863                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2864                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2865                        &ib_spec->ipv4.val.dst_ip,
2866                        sizeof(ib_spec->ipv4.val.dst_ip));
2867
2868                 set_tos(headers_c, headers_v,
2869                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
2870
2871                 if (set_proto(headers_c, headers_v,
2872                               ib_spec->ipv4.mask.proto,
2873                               ib_spec->ipv4.val.proto))
2874                         return -EINVAL;
2875                 break;
2876         case IB_FLOW_SPEC_IPV6:
2877                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
2878                         return -EOPNOTSUPP;
2879
2880                 if (match_ipv) {
2881                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2882                                  ip_version, 0xf);
2883                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2884                                  ip_version, MLX5_FS_IPV6_VERSION);
2885                 } else {
2886                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2887                                  ethertype, 0xffff);
2888                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2889                                  ethertype, ETH_P_IPV6);
2890                 }
2891
2892                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2893                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2894                        &ib_spec->ipv6.mask.src_ip,
2895                        sizeof(ib_spec->ipv6.mask.src_ip));
2896                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2897                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2898                        &ib_spec->ipv6.val.src_ip,
2899                        sizeof(ib_spec->ipv6.val.src_ip));
2900                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2901                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2902                        &ib_spec->ipv6.mask.dst_ip,
2903                        sizeof(ib_spec->ipv6.mask.dst_ip));
2904                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2905                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2906                        &ib_spec->ipv6.val.dst_ip,
2907                        sizeof(ib_spec->ipv6.val.dst_ip));
2908
2909                 set_tos(headers_c, headers_v,
2910                         ib_spec->ipv6.mask.traffic_class,
2911                         ib_spec->ipv6.val.traffic_class);
2912
2913                 if (set_proto(headers_c, headers_v,
2914                               ib_spec->ipv6.mask.next_hdr,
2915                               ib_spec->ipv6.val.next_hdr))
2916                         return -EINVAL;
2917
2918                 set_flow_label(misc_params_c, misc_params_v,
2919                                ntohl(ib_spec->ipv6.mask.flow_label),
2920                                ntohl(ib_spec->ipv6.val.flow_label),
2921                                ib_spec->type & IB_FLOW_SPEC_INNER);
2922                 break;
2923         case IB_FLOW_SPEC_ESP:
2924                 if (ib_spec->esp.mask.seq)
2925                         return -EOPNOTSUPP;
2926
2927                 MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
2928                          ntohl(ib_spec->esp.mask.spi));
2929                 MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
2930                          ntohl(ib_spec->esp.val.spi));
2931                 break;
2932         case IB_FLOW_SPEC_TCP:
2933                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2934                                          LAST_TCP_UDP_FIELD))
2935                         return -EOPNOTSUPP;
2936
2937                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
2938                         return -EINVAL;
2939
2940                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
2941                          ntohs(ib_spec->tcp_udp.mask.src_port));
2942                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
2943                          ntohs(ib_spec->tcp_udp.val.src_port));
2944
2945                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
2946                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2947                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
2948                          ntohs(ib_spec->tcp_udp.val.dst_port));
2949                 break;
2950         case IB_FLOW_SPEC_UDP:
2951                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2952                                          LAST_TCP_UDP_FIELD))
2953                         return -EOPNOTSUPP;
2954
2955                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
2956                         return -EINVAL;
2957
2958                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
2959                          ntohs(ib_spec->tcp_udp.mask.src_port));
2960                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
2961                          ntohs(ib_spec->tcp_udp.val.src_port));
2962
2963                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
2964                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2965                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
2966                          ntohs(ib_spec->tcp_udp.val.dst_port));
2967                 break;
2968         case IB_FLOW_SPEC_GRE:
2969                 if (ib_spec->gre.mask.c_ks_res0_ver)
2970                         return -EOPNOTSUPP;
2971
2972                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
2973                         return -EINVAL;
2974
2975                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2976                          0xff);
2977                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2978                          IPPROTO_GRE);
2979
2980                 MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
2981                          ntohs(ib_spec->gre.mask.protocol));
2982                 MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
2983                          ntohs(ib_spec->gre.val.protocol));
2984
2985                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
2986                                     gre_key.nvgre.hi),
2987                        &ib_spec->gre.mask.key,
2988                        sizeof(ib_spec->gre.mask.key));
2989                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
2990                                     gre_key.nvgre.hi),
2991                        &ib_spec->gre.val.key,
2992                        sizeof(ib_spec->gre.val.key));
2993                 break;
2994         case IB_FLOW_SPEC_MPLS:
2995                 switch (prev_type) {
2996                 case IB_FLOW_SPEC_UDP:
2997                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2998                                                    ft_field_support.outer_first_mpls_over_udp),
2999                                                    &ib_spec->mpls.mask.tag))
3000                                 return -EOPNOTSUPP;
3001
3002                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3003                                             outer_first_mpls_over_udp),
3004                                &ib_spec->mpls.val.tag,
3005                                sizeof(ib_spec->mpls.val.tag));
3006                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3007                                             outer_first_mpls_over_udp),
3008                                &ib_spec->mpls.mask.tag,
3009                                sizeof(ib_spec->mpls.mask.tag));
3010                         break;
3011                 case IB_FLOW_SPEC_GRE:
3012                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3013                                                    ft_field_support.outer_first_mpls_over_gre),
3014                                                    &ib_spec->mpls.mask.tag))
3015                                 return -EOPNOTSUPP;
3016
3017                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3018                                             outer_first_mpls_over_gre),
3019                                &ib_spec->mpls.val.tag,
3020                                sizeof(ib_spec->mpls.val.tag));
3021                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3022                                             outer_first_mpls_over_gre),
3023                                &ib_spec->mpls.mask.tag,
3024                                sizeof(ib_spec->mpls.mask.tag));
3025                         break;
3026                 default:
3027                         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
3028                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3029                                                            ft_field_support.inner_first_mpls),
3030                                                            &ib_spec->mpls.mask.tag))
3031                                         return -EOPNOTSUPP;
3032
3033                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3034                                                     inner_first_mpls),
3035                                        &ib_spec->mpls.val.tag,
3036                                        sizeof(ib_spec->mpls.val.tag));
3037                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3038                                                     inner_first_mpls),
3039                                        &ib_spec->mpls.mask.tag,
3040                                        sizeof(ib_spec->mpls.mask.tag));
3041                         } else {
3042                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3043                                                            ft_field_support.outer_first_mpls),
3044                                                            &ib_spec->mpls.mask.tag))
3045                                         return -EOPNOTSUPP;
3046
3047                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3048                                                     outer_first_mpls),
3049                                        &ib_spec->mpls.val.tag,
3050                                        sizeof(ib_spec->mpls.val.tag));
3051                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3052                                                     outer_first_mpls),
3053                                        &ib_spec->mpls.mask.tag,
3054                                        sizeof(ib_spec->mpls.mask.tag));
3055                         }
3056                 }
3057                 break;
3058         case IB_FLOW_SPEC_VXLAN_TUNNEL:
3059                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
3060                                          LAST_TUNNEL_FIELD))
3061                         return -EOPNOTSUPP;
3062
3063                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
3064                          ntohl(ib_spec->tunnel.mask.tunnel_id));
3065                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
3066                          ntohl(ib_spec->tunnel.val.tunnel_id));
3067                 break;
3068         case IB_FLOW_SPEC_ACTION_TAG:
3069                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
3070                                          LAST_FLOW_TAG_FIELD))
3071                         return -EOPNOTSUPP;
3072                 if (ib_spec->flow_tag.tag_id >= BIT(24))
3073                         return -EINVAL;
3074
3075                 flow_context->flow_tag = ib_spec->flow_tag.tag_id;
3076                 flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
3077                 break;
3078         case IB_FLOW_SPEC_ACTION_DROP:
3079                 if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
3080                                          LAST_DROP_FIELD))
3081                         return -EOPNOTSUPP;
3082                 action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
3083                 break;
3084         case IB_FLOW_SPEC_ACTION_HANDLE:
3085                 ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
3086                         flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
3087                 if (ret)
3088                         return ret;
3089                 break;
3090         case IB_FLOW_SPEC_ACTION_COUNT:
3091                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
3092                                          LAST_COUNTERS_FIELD))
3093                         return -EOPNOTSUPP;
3094
3095                 /* for now support only one counters spec per flow */
3096                 if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
3097                         return -EINVAL;
3098
3099                 action->counters = ib_spec->flow_count.counters;
3100                 action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
3101                 break;
3102         default:
3103                 return -EINVAL;
3104         }
3105
3106         return 0;
3107 }
3108
3109 /* If a flow could catch both multicast and unicast packets,
3110  * it won't fall into the multicast flow steering table and this rule
3111  * could steal other multicast packets.
3112  */
3113 static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
3114 {
3115         union ib_flow_spec *flow_spec;
3116
3117         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
3118             ib_attr->num_of_specs < 1)
3119                 return false;
3120
3121         flow_spec = (union ib_flow_spec *)(ib_attr + 1);
3122         if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
3123                 struct ib_flow_spec_ipv4 *ipv4_spec;
3124
3125                 ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
3126                 if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
3127                         return true;
3128
3129                 return false;
3130         }
3131
3132         if (flow_spec->type == IB_FLOW_SPEC_ETH) {
3133                 struct ib_flow_spec_eth *eth_spec;
3134
3135                 eth_spec = (struct ib_flow_spec_eth *)flow_spec;
3136                 return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
3137                        is_multicast_ether_addr(eth_spec->val.dst_mac);
3138         }
3139
3140         return false;
3141 }
3142
3143 enum valid_spec {
3144         VALID_SPEC_INVALID,
3145         VALID_SPEC_VALID,
3146         VALID_SPEC_NA,
3147 };
3148
3149 static enum valid_spec
3150 is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
3151                      const struct mlx5_flow_spec *spec,
3152                      const struct mlx5_flow_act *flow_act,
3153                      bool egress)
3154 {
3155         const u32 *match_c = spec->match_criteria;
3156         bool is_crypto =
3157                 (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
3158                                      MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
3159         bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
3160         bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
3161
3162         /*
3163          * Currently only crypto is supported in egress, when regular egress
3164          * rules would be supported, always return VALID_SPEC_NA.
3165          */
3166         if (!is_crypto)
3167                 return VALID_SPEC_NA;
3168
3169         return is_crypto && is_ipsec &&
3170                 (!egress || (!is_drop &&
3171                              !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
3172                 VALID_SPEC_VALID : VALID_SPEC_INVALID;
3173 }
3174
3175 static bool is_valid_spec(struct mlx5_core_dev *mdev,
3176                           const struct mlx5_flow_spec *spec,
3177                           const struct mlx5_flow_act *flow_act,
3178                           bool egress)
3179 {
3180         /* We curretly only support ipsec egress flow */
3181         return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
3182 }
3183
3184 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
3185                                const struct ib_flow_attr *flow_attr,
3186                                bool check_inner)
3187 {
3188         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
3189         int match_ipv = check_inner ?
3190                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3191                                         ft_field_support.inner_ip_version) :
3192                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3193                                         ft_field_support.outer_ip_version);
3194         int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
3195         bool ipv4_spec_valid, ipv6_spec_valid;
3196         unsigned int ip_spec_type = 0;
3197         bool has_ethertype = false;
3198         unsigned int spec_index;
3199         bool mask_valid = true;
3200         u16 eth_type = 0;
3201         bool type_valid;
3202
3203         /* Validate that ethertype is correct */
3204         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3205                 if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
3206                     ib_spec->eth.mask.ether_type) {
3207                         mask_valid = (ib_spec->eth.mask.ether_type ==
3208                                       htons(0xffff));
3209                         has_ethertype = true;
3210                         eth_type = ntohs(ib_spec->eth.val.ether_type);
3211                 } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
3212                            (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
3213                         ip_spec_type = ib_spec->type;
3214                 }
3215                 ib_spec = (void *)ib_spec + ib_spec->size;
3216         }
3217
3218         type_valid = (!has_ethertype) || (!ip_spec_type);
3219         if (!type_valid && mask_valid) {
3220                 ipv4_spec_valid = (eth_type == ETH_P_IP) &&
3221                         (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
3222                 ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
3223                         (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
3224
3225                 type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
3226                              (((eth_type == ETH_P_MPLS_UC) ||
3227                                (eth_type == ETH_P_MPLS_MC)) && match_ipv);
3228         }
3229
3230         return type_valid;
3231 }
3232
3233 static bool is_valid_attr(struct mlx5_core_dev *mdev,
3234                           const struct ib_flow_attr *flow_attr)
3235 {
3236         return is_valid_ethertype(mdev, flow_attr, false) &&
3237                is_valid_ethertype(mdev, flow_attr, true);
3238 }
3239
3240 static void put_flow_table(struct mlx5_ib_dev *dev,
3241                            struct mlx5_ib_flow_prio *prio, bool ft_added)
3242 {
3243         prio->refcount -= !!ft_added;
3244         if (!prio->refcount) {
3245                 mlx5_destroy_flow_table(prio->flow_table);
3246                 prio->flow_table = NULL;
3247         }
3248 }
3249
3250 static void counters_clear_description(struct ib_counters *counters)
3251 {
3252         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3253
3254         mutex_lock(&mcounters->mcntrs_mutex);
3255         kfree(mcounters->counters_data);
3256         mcounters->counters_data = NULL;
3257         mcounters->cntrs_max_index = 0;
3258         mutex_unlock(&mcounters->mcntrs_mutex);
3259 }
3260
3261 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
3262 {
3263         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
3264                                                           struct mlx5_ib_flow_handler,
3265                                                           ibflow);
3266         struct mlx5_ib_flow_handler *iter, *tmp;
3267         struct mlx5_ib_dev *dev = handler->dev;
3268
3269         mutex_lock(&dev->flow_db->lock);
3270
3271         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
3272                 mlx5_del_flow_rules(iter->rule);
3273                 put_flow_table(dev, iter->prio, true);
3274                 list_del(&iter->list);
3275                 kfree(iter);
3276         }
3277
3278         mlx5_del_flow_rules(handler->rule);
3279         put_flow_table(dev, handler->prio, true);
3280         if (handler->ibcounters &&
3281             atomic_read(&handler->ibcounters->usecnt) == 1)
3282                 counters_clear_description(handler->ibcounters);
3283
3284         mutex_unlock(&dev->flow_db->lock);
3285         if (handler->flow_matcher)
3286                 atomic_dec(&handler->flow_matcher->usecnt);
3287         kfree(handler);
3288
3289         return 0;
3290 }
3291
3292 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
3293 {
3294         priority *= 2;
3295         if (!dont_trap)
3296                 priority++;
3297         return priority;
3298 }
3299
3300 enum flow_table_type {
3301         MLX5_IB_FT_RX,
3302         MLX5_IB_FT_TX
3303 };
3304
3305 #define MLX5_FS_MAX_TYPES        6
3306 #define MLX5_FS_MAX_ENTRIES      BIT(16)
3307
3308 static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
3309                                            struct mlx5_ib_flow_prio *prio,
3310                                            int priority,
3311                                            int num_entries, int num_groups,
3312                                            u32 flags)
3313 {
3314         struct mlx5_flow_table_attr ft_attr = {};
3315         struct mlx5_flow_table *ft;
3316
3317         ft_attr.prio = priority;
3318         ft_attr.max_fte = num_entries;
3319         ft_attr.flags = flags;
3320         ft_attr.autogroup.max_num_groups = num_groups;
3321         ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
3322         if (IS_ERR(ft))
3323                 return ERR_CAST(ft);
3324
3325         prio->flow_table = ft;
3326         prio->refcount = 0;
3327         return prio;
3328 }
3329
3330 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
3331                                                 struct ib_flow_attr *flow_attr,
3332                                                 enum flow_table_type ft_type)
3333 {
3334         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
3335         struct mlx5_flow_namespace *ns = NULL;
3336         struct mlx5_ib_flow_prio *prio;
3337         struct mlx5_flow_table *ft;
3338         int max_table_size;
3339         int num_entries;
3340         int num_groups;
3341         bool esw_encap;
3342         u32 flags = 0;
3343         int priority;
3344
3345         max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3346                                                        log_max_ft_size));
3347         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
3348                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
3349         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3350                 enum mlx5_flow_namespace_type fn_type;
3351
3352                 if (flow_is_multicast_only(flow_attr) &&
3353                     !dont_trap)
3354                         priority = MLX5_IB_FLOW_MCAST_PRIO;
3355                 else
3356                         priority = ib_prio_to_core_prio(flow_attr->priority,
3357                                                         dont_trap);
3358                 if (ft_type == MLX5_IB_FT_RX) {
3359                         fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
3360                         prio = &dev->flow_db->prios[priority];
3361                         if (!dev->is_rep && !esw_encap &&
3362                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
3363                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3364                         if (!dev->is_rep && !esw_encap &&
3365                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3366                                         reformat_l3_tunnel_to_l2))
3367                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3368                 } else {
3369                         max_table_size =
3370                                 BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
3371                                                               log_max_ft_size));
3372                         fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
3373                         prio = &dev->flow_db->egress_prios[priority];
3374                         if (!dev->is_rep && !esw_encap &&
3375                             MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
3376                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3377                 }
3378                 ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
3379                 num_entries = MLX5_FS_MAX_ENTRIES;
3380                 num_groups = MLX5_FS_MAX_TYPES;
3381         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3382                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3383                 ns = mlx5_get_flow_namespace(dev->mdev,
3384                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
3385                 build_leftovers_ft_param(&priority,
3386                                          &num_entries,
3387                                          &num_groups);
3388                 prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
3389         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3390                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
3391                                         allow_sniffer_and_nic_rx_shared_tir))
3392                         return ERR_PTR(-ENOTSUPP);
3393
3394                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
3395                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
3396                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
3397
3398                 prio = &dev->flow_db->sniffer[ft_type];
3399                 priority = 0;
3400                 num_entries = 1;
3401                 num_groups = 1;
3402         }
3403
3404         if (!ns)
3405                 return ERR_PTR(-ENOTSUPP);
3406
3407         max_table_size = min_t(int, num_entries, max_table_size);
3408
3409         ft = prio->flow_table;
3410         if (!ft)
3411                 return _get_prio(ns, prio, priority, max_table_size, num_groups,
3412                                  flags);
3413
3414         return prio;
3415 }
3416
3417 static void set_underlay_qp(struct mlx5_ib_dev *dev,
3418                             struct mlx5_flow_spec *spec,
3419                             u32 underlay_qpn)
3420 {
3421         void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
3422                                            spec->match_criteria,
3423                                            misc_parameters);
3424         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3425                                            misc_parameters);
3426
3427         if (underlay_qpn &&
3428             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3429                                       ft_field_support.bth_dst_qp)) {
3430                 MLX5_SET(fte_match_set_misc,
3431                          misc_params_v, bth_dst_qp, underlay_qpn);
3432                 MLX5_SET(fte_match_set_misc,
3433                          misc_params_c, bth_dst_qp, 0xffffff);
3434         }
3435 }
3436
3437 static int read_flow_counters(struct ib_device *ibdev,
3438                               struct mlx5_read_counters_attr *read_attr)
3439 {
3440         struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
3441         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3442
3443         return mlx5_fc_query(dev->mdev, fc,
3444                              &read_attr->out[IB_COUNTER_PACKETS],
3445                              &read_attr->out[IB_COUNTER_BYTES]);
3446 }
3447
3448 /* flow counters currently expose two counters packets and bytes */
3449 #define FLOW_COUNTERS_NUM 2
3450 static int counters_set_description(struct ib_counters *counters,
3451                                     enum mlx5_ib_counters_type counters_type,
3452                                     struct mlx5_ib_flow_counters_desc *desc_data,
3453                                     u32 ncounters)
3454 {
3455         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3456         u32 cntrs_max_index = 0;
3457         int i;
3458
3459         if (counters_type != MLX5_IB_COUNTERS_FLOW)
3460                 return -EINVAL;
3461
3462         /* init the fields for the object */
3463         mcounters->type = counters_type;
3464         mcounters->read_counters = read_flow_counters;
3465         mcounters->counters_num = FLOW_COUNTERS_NUM;
3466         mcounters->ncounters = ncounters;
3467         /* each counter entry have both description and index pair */
3468         for (i = 0; i < ncounters; i++) {
3469                 if (desc_data[i].description > IB_COUNTER_BYTES)
3470                         return -EINVAL;
3471
3472                 if (cntrs_max_index <= desc_data[i].index)
3473                         cntrs_max_index = desc_data[i].index + 1;
3474         }
3475
3476         mutex_lock(&mcounters->mcntrs_mutex);
3477         mcounters->counters_data = desc_data;
3478         mcounters->cntrs_max_index = cntrs_max_index;
3479         mutex_unlock(&mcounters->mcntrs_mutex);
3480
3481         return 0;
3482 }
3483
3484 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
3485 static int flow_counters_set_data(struct ib_counters *ibcounters,
3486                                   struct mlx5_ib_create_flow *ucmd)
3487 {
3488         struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
3489         struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
3490         struct mlx5_ib_flow_counters_desc *desc_data = NULL;
3491         bool hw_hndl = false;
3492         int ret = 0;
3493
3494         if (ucmd && ucmd->ncounters_data != 0) {
3495                 cntrs_data = ucmd->data;
3496                 if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
3497                         return -EINVAL;
3498
3499                 desc_data = kcalloc(cntrs_data->ncounters,
3500                                     sizeof(*desc_data),
3501                                     GFP_KERNEL);
3502                 if (!desc_data)
3503                         return  -ENOMEM;
3504
3505                 if (copy_from_user(desc_data,
3506                                    u64_to_user_ptr(cntrs_data->counters_data),
3507                                    sizeof(*desc_data) * cntrs_data->ncounters)) {
3508                         ret = -EFAULT;
3509                         goto free;
3510                 }
3511         }
3512
3513         if (!mcounters->hw_cntrs_hndl) {
3514                 mcounters->hw_cntrs_hndl = mlx5_fc_create(
3515                         to_mdev(ibcounters->device)->mdev, false);
3516                 if (IS_ERR(mcounters->hw_cntrs_hndl)) {
3517                         ret = PTR_ERR(mcounters->hw_cntrs_hndl);
3518                         goto free;
3519                 }
3520                 hw_hndl = true;
3521         }
3522
3523         if (desc_data) {
3524                 /* counters already bound to at least one flow */
3525                 if (mcounters->cntrs_max_index) {
3526                         ret = -EINVAL;
3527                         goto free_hndl;
3528                 }
3529
3530                 ret = counters_set_description(ibcounters,
3531                                                MLX5_IB_COUNTERS_FLOW,
3532                                                desc_data,
3533                                                cntrs_data->ncounters);
3534                 if (ret)
3535                         goto free_hndl;
3536
3537         } else if (!mcounters->cntrs_max_index) {
3538                 /* counters not bound yet, must have udata passed */
3539                 ret = -EINVAL;
3540                 goto free_hndl;
3541         }
3542
3543         return 0;
3544
3545 free_hndl:
3546         if (hw_hndl) {
3547                 mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
3548                                 mcounters->hw_cntrs_hndl);
3549                 mcounters->hw_cntrs_hndl = NULL;
3550         }
3551 free:
3552         kfree(desc_data);
3553         return ret;
3554 }
3555
3556 static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
3557                                          struct mlx5_flow_spec *spec,
3558                                          struct mlx5_eswitch_rep *rep)
3559 {
3560         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
3561         void *misc;
3562
3563         if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
3564                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3565                                     misc_parameters_2);
3566
3567                 MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
3568                          mlx5_eswitch_get_vport_metadata_for_match(esw,
3569                                                                    rep->vport));
3570                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3571                                     misc_parameters_2);
3572
3573                 MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
3574         } else {
3575                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3576                                     misc_parameters);
3577
3578                 MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
3579
3580                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3581                                     misc_parameters);
3582
3583                 MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
3584         }
3585 }
3586
3587 static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
3588                                                       struct mlx5_ib_flow_prio *ft_prio,
3589                                                       const struct ib_flow_attr *flow_attr,
3590                                                       struct mlx5_flow_destination *dst,
3591                                                       u32 underlay_qpn,
3592                                                       struct mlx5_ib_create_flow *ucmd)
3593 {
3594         struct mlx5_flow_table  *ft = ft_prio->flow_table;
3595         struct mlx5_ib_flow_handler *handler;
3596         struct mlx5_flow_act flow_act = {};
3597         struct mlx5_flow_spec *spec;
3598         struct mlx5_flow_destination dest_arr[2] = {};
3599         struct mlx5_flow_destination *rule_dst = dest_arr;
3600         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
3601         unsigned int spec_index;
3602         u32 prev_type = 0;
3603         int err = 0;
3604         int dest_num = 0;
3605         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3606
3607         if (!is_valid_attr(dev->mdev, flow_attr))
3608                 return ERR_PTR(-EINVAL);
3609
3610         if (dev->is_rep && is_egress)
3611                 return ERR_PTR(-EINVAL);
3612
3613         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
3614         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
3615         if (!handler || !spec) {
3616                 err = -ENOMEM;
3617                 goto free;
3618         }
3619
3620         INIT_LIST_HEAD(&handler->list);
3621
3622         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3623                 err = parse_flow_attr(dev->mdev, spec,
3624                                       ib_flow, flow_attr, &flow_act,
3625                                       prev_type);
3626                 if (err < 0)
3627                         goto free;
3628
3629                 prev_type = ((union ib_flow_spec *)ib_flow)->type;
3630                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
3631         }
3632
3633         if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
3634                 memcpy(&dest_arr[0], dst, sizeof(*dst));
3635                 dest_num++;
3636         }
3637
3638         if (!flow_is_multicast_only(flow_attr))
3639                 set_underlay_qp(dev, spec, underlay_qpn);
3640
3641         if (dev->is_rep) {
3642                 struct mlx5_eswitch_rep *rep;
3643
3644                 rep = dev->port[flow_attr->port - 1].rep;
3645                 if (!rep) {
3646                         err = -EINVAL;
3647                         goto free;
3648                 }
3649
3650                 mlx5_ib_set_rule_source_port(dev, spec, rep);
3651         }
3652
3653         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
3654
3655         if (is_egress &&
3656             !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
3657                 err = -EINVAL;
3658                 goto free;
3659         }
3660
3661         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
3662                 struct mlx5_ib_mcounters *mcounters;
3663
3664                 err = flow_counters_set_data(flow_act.counters, ucmd);
3665                 if (err)
3666                         goto free;
3667
3668                 mcounters = to_mcounters(flow_act.counters);
3669                 handler->ibcounters = flow_act.counters;
3670                 dest_arr[dest_num].type =
3671                         MLX5_FLOW_DESTINATION_TYPE_COUNTER;
3672                 dest_arr[dest_num].counter_id =
3673                         mlx5_fc_id(mcounters->hw_cntrs_hndl);
3674                 dest_num++;
3675         }
3676
3677         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
3678                 if (!dest_num)
3679                         rule_dst = NULL;
3680         } else {
3681                 if (is_egress)
3682                         flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
3683                 else
3684                         flow_act.action |=
3685                                 dest_num ?  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
3686                                         MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
3687         }
3688
3689         if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
3690             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3691              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3692                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
3693                              spec->flow_context.flow_tag, flow_attr->type);
3694                 err = -EINVAL;
3695                 goto free;
3696         }
3697         handler->rule = mlx5_add_flow_rules(ft, spec,
3698                                             &flow_act,
3699                                             rule_dst, dest_num);
3700
3701         if (IS_ERR(handler->rule)) {
3702                 err = PTR_ERR(handler->rule);
3703                 goto free;
3704         }
3705
3706         ft_prio->refcount++;
3707         handler->prio = ft_prio;
3708         handler->dev = dev;
3709
3710         ft_prio->flow_table = ft;
3711 free:
3712         if (err && handler) {
3713                 if (handler->ibcounters &&
3714                     atomic_read(&handler->ibcounters->usecnt) == 1)
3715                         counters_clear_description(handler->ibcounters);
3716                 kfree(handler);
3717         }
3718         kvfree(spec);
3719         return err ? ERR_PTR(err) : handler;
3720 }
3721
3722 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
3723                                                      struct mlx5_ib_flow_prio *ft_prio,
3724                                                      const struct ib_flow_attr *flow_attr,
3725                                                      struct mlx5_flow_destination *dst)
3726 {
3727         return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
3728 }
3729
3730 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
3731                                                           struct mlx5_ib_flow_prio *ft_prio,
3732                                                           struct ib_flow_attr *flow_attr,
3733                                                           struct mlx5_flow_destination *dst)
3734 {
3735         struct mlx5_ib_flow_handler *handler_dst = NULL;
3736         struct mlx5_ib_flow_handler *handler = NULL;
3737
3738         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
3739         if (!IS_ERR(handler)) {
3740                 handler_dst = create_flow_rule(dev, ft_prio,
3741                                                flow_attr, dst);
3742                 if (IS_ERR(handler_dst)) {
3743                         mlx5_del_flow_rules(handler->rule);
3744                         ft_prio->refcount--;
3745                         kfree(handler);
3746                         handler = handler_dst;
3747                 } else {
3748                         list_add(&handler_dst->list, &handler->list);
3749                 }
3750         }
3751
3752         return handler;
3753 }
3754 enum {
3755         LEFTOVERS_MC,
3756         LEFTOVERS_UC,
3757 };
3758
3759 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
3760                                                           struct mlx5_ib_flow_prio *ft_prio,
3761                                                           struct ib_flow_attr *flow_attr,
3762                                                           struct mlx5_flow_destination *dst)
3763 {
3764         struct mlx5_ib_flow_handler *handler_ucast = NULL;
3765         struct mlx5_ib_flow_handler *handler = NULL;
3766
3767         static struct {
3768                 struct ib_flow_attr     flow_attr;
3769                 struct ib_flow_spec_eth eth_flow;
3770         } leftovers_specs[] = {
3771                 [LEFTOVERS_MC] = {
3772                         .flow_attr = {
3773                                 .num_of_specs = 1,
3774                                 .size = sizeof(leftovers_specs[0])
3775                         },
3776                         .eth_flow = {
3777                                 .type = IB_FLOW_SPEC_ETH,
3778                                 .size = sizeof(struct ib_flow_spec_eth),
3779                                 .mask = {.dst_mac = {0x1} },
3780                                 .val =  {.dst_mac = {0x1} }
3781                         }
3782                 },
3783                 [LEFTOVERS_UC] = {
3784                         .flow_attr = {
3785                                 .num_of_specs = 1,
3786                                 .size = sizeof(leftovers_specs[0])
3787                         },
3788                         .eth_flow = {
3789                                 .type = IB_FLOW_SPEC_ETH,
3790                                 .size = sizeof(struct ib_flow_spec_eth),
3791                                 .mask = {.dst_mac = {0x1} },
3792                                 .val = {.dst_mac = {} }
3793                         }
3794                 }
3795         };
3796
3797         handler = create_flow_rule(dev, ft_prio,
3798                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
3799                                    dst);
3800         if (!IS_ERR(handler) &&
3801             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
3802                 handler_ucast = create_flow_rule(dev, ft_prio,
3803                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
3804                                                  dst);
3805                 if (IS_ERR(handler_ucast)) {
3806                         mlx5_del_flow_rules(handler->rule);
3807                         ft_prio->refcount--;
3808                         kfree(handler);
3809                         handler = handler_ucast;
3810                 } else {
3811                         list_add(&handler_ucast->list, &handler->list);
3812                 }
3813         }
3814
3815         return handler;
3816 }
3817
3818 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
3819                                                         struct mlx5_ib_flow_prio *ft_rx,
3820                                                         struct mlx5_ib_flow_prio *ft_tx,
3821                                                         struct mlx5_flow_destination *dst)
3822 {
3823         struct mlx5_ib_flow_handler *handler_rx;
3824         struct mlx5_ib_flow_handler *handler_tx;
3825         int err;
3826         static const struct ib_flow_attr flow_attr  = {
3827                 .num_of_specs = 0,
3828                 .size = sizeof(flow_attr)
3829         };
3830
3831         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
3832         if (IS_ERR(handler_rx)) {
3833                 err = PTR_ERR(handler_rx);
3834                 goto err;
3835         }
3836
3837         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
3838         if (IS_ERR(handler_tx)) {
3839                 err = PTR_ERR(handler_tx);
3840                 goto err_tx;
3841         }
3842
3843         list_add(&handler_tx->list, &handler_rx->list);
3844
3845         return handler_rx;
3846
3847 err_tx:
3848         mlx5_del_flow_rules(handler_rx->rule);
3849         ft_rx->refcount--;
3850         kfree(handler_rx);
3851 err:
3852         return ERR_PTR(err);
3853 }
3854
3855 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
3856                                            struct ib_flow_attr *flow_attr,
3857                                            int domain,
3858                                            struct ib_udata *udata)
3859 {
3860         struct mlx5_ib_dev *dev = to_mdev(qp->device);
3861         struct mlx5_ib_qp *mqp = to_mqp(qp);
3862         struct mlx5_ib_flow_handler *handler = NULL;
3863         struct mlx5_flow_destination *dst = NULL;
3864         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
3865         struct mlx5_ib_flow_prio *ft_prio;
3866         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3867         struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
3868         size_t min_ucmd_sz, required_ucmd_sz;
3869         int err;
3870         int underlay_qpn;
3871
3872         if (udata && udata->inlen) {
3873                 min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
3874                                 sizeof(ucmd_hdr.reserved);
3875                 if (udata->inlen < min_ucmd_sz)
3876                         return ERR_PTR(-EOPNOTSUPP);
3877
3878                 err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
3879                 if (err)
3880                         return ERR_PTR(err);
3881
3882                 /* currently supports only one counters data */
3883                 if (ucmd_hdr.ncounters_data > 1)
3884                         return ERR_PTR(-EINVAL);
3885
3886                 required_ucmd_sz = min_ucmd_sz +
3887                         sizeof(struct mlx5_ib_flow_counters_data) *
3888                         ucmd_hdr.ncounters_data;
3889                 if (udata->inlen > required_ucmd_sz &&
3890                     !ib_is_udata_cleared(udata, required_ucmd_sz,
3891                                          udata->inlen - required_ucmd_sz))
3892                         return ERR_PTR(-EOPNOTSUPP);
3893
3894                 ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
3895                 if (!ucmd)
3896                         return ERR_PTR(-ENOMEM);
3897
3898                 err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
3899                 if (err)
3900                         goto free_ucmd;
3901         }
3902
3903         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
3904                 err = -ENOMEM;
3905                 goto free_ucmd;
3906         }
3907
3908         if (domain != IB_FLOW_DOMAIN_USER ||
3909             flow_attr->port > dev->num_ports ||
3910             (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
3911                                   IB_FLOW_ATTR_FLAGS_EGRESS))) {
3912                 err = -EINVAL;
3913                 goto free_ucmd;
3914         }
3915
3916         if (is_egress &&
3917             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3918              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3919                 err = -EINVAL;
3920                 goto free_ucmd;
3921         }
3922
3923         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
3924         if (!dst) {
3925                 err = -ENOMEM;
3926                 goto free_ucmd;
3927         }
3928
3929         mutex_lock(&dev->flow_db->lock);
3930
3931         ft_prio = get_flow_table(dev, flow_attr,
3932                                  is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
3933         if (IS_ERR(ft_prio)) {
3934                 err = PTR_ERR(ft_prio);
3935                 goto unlock;
3936         }
3937         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3938                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
3939                 if (IS_ERR(ft_prio_tx)) {
3940                         err = PTR_ERR(ft_prio_tx);
3941                         ft_prio_tx = NULL;
3942                         goto destroy_ft;
3943                 }
3944         }
3945
3946         if (is_egress) {
3947                 dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
3948         } else {
3949                 dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
3950                 if (mqp->flags & MLX5_IB_QP_RSS)
3951                         dst->tir_num = mqp->rss_qp.tirn;
3952                 else
3953                         dst->tir_num = mqp->raw_packet_qp.rq.tirn;
3954         }
3955
3956         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3957                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
3958                         handler = create_dont_trap_rule(dev, ft_prio,
3959                                                         flow_attr, dst);
3960                 } else {
3961                         underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ?
3962                                         mqp->underlay_qpn : 0;
3963                         handler = _create_flow_rule(dev, ft_prio, flow_attr,
3964                                                     dst, underlay_qpn, ucmd);
3965                 }
3966         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3967                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3968                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
3969                                                 dst);
3970         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3971                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
3972         } else {
3973                 err = -EINVAL;
3974                 goto destroy_ft;
3975         }
3976
3977         if (IS_ERR(handler)) {
3978                 err = PTR_ERR(handler);
3979                 handler = NULL;
3980                 goto destroy_ft;
3981         }
3982
3983         mutex_unlock(&dev->flow_db->lock);
3984         kfree(dst);
3985         kfree(ucmd);
3986
3987         return &handler->ibflow;
3988
3989 destroy_ft:
3990         put_flow_table(dev, ft_prio, false);
3991         if (ft_prio_tx)
3992                 put_flow_table(dev, ft_prio_tx, false);
3993 unlock:
3994         mutex_unlock(&dev->flow_db->lock);
3995         kfree(dst);
3996 free_ucmd:
3997         kfree(ucmd);
3998         return ERR_PTR(err);
3999 }
4000
4001 static struct mlx5_ib_flow_prio *
4002 _get_flow_table(struct mlx5_ib_dev *dev,
4003                 struct mlx5_ib_flow_matcher *fs_matcher,
4004                 bool mcast)
4005 {
4006         struct mlx5_flow_namespace *ns = NULL;
4007         struct mlx5_ib_flow_prio *prio = NULL;
4008         int max_table_size = 0;
4009         bool esw_encap;
4010         u32 flags = 0;
4011         int priority;
4012
4013         if (mcast)
4014                 priority = MLX5_IB_FLOW_MCAST_PRIO;
4015         else
4016                 priority = ib_prio_to_core_prio(fs_matcher->priority, false);
4017
4018         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
4019                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
4020         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
4021                 max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
4022                                         log_max_ft_size));
4023                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
4024                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
4025                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
4026                                               reformat_l3_tunnel_to_l2) &&
4027                     !esw_encap)
4028                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4029         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
4030                 max_table_size = BIT(
4031                         MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
4032                 if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
4033                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4034         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
4035                 max_table_size = BIT(
4036                         MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
4037                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
4038                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
4039                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
4040                     esw_encap)
4041                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4042                 priority = FDB_BYPASS_PATH;
4043         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
4044                 max_table_size =
4045                         BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
4046                                                        log_max_ft_size));
4047                 priority = fs_matcher->priority;
4048         }
4049
4050         max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
4051
4052         ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
4053         if (!ns)
4054                 return ERR_PTR(-ENOTSUPP);
4055
4056         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
4057                 prio = &dev->flow_db->prios[priority];
4058         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
4059                 prio = &dev->flow_db->egress_prios[priority];
4060         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
4061                 prio = &dev->flow_db->fdb;
4062         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
4063                 prio = &dev->flow_db->rdma_rx[priority];
4064
4065         if (!prio)
4066                 return ERR_PTR(-EINVAL);
4067
4068         if (prio->flow_table)
4069                 return prio;
4070
4071         return _get_prio(ns, prio, priority, max_table_size,
4072                          MLX5_FS_MAX_TYPES, flags);
4073 }
4074
4075 static struct mlx5_ib_flow_handler *
4076 _create_raw_flow_rule(struct mlx5_ib_dev *dev,
4077                       struct mlx5_ib_flow_prio *ft_prio,
4078                       struct mlx5_flow_destination *dst,
4079                       struct mlx5_ib_flow_matcher  *fs_matcher,
4080                       struct mlx5_flow_context *flow_context,
4081                       struct mlx5_flow_act *flow_act,
4082                       void *cmd_in, int inlen,
4083                       int dst_num)
4084 {
4085         struct mlx5_ib_flow_handler *handler;
4086         struct mlx5_flow_spec *spec;
4087         struct mlx5_flow_table *ft = ft_prio->flow_table;
4088         int err = 0;
4089
4090         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
4091         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
4092         if (!handler || !spec) {
4093                 err = -ENOMEM;
4094                 goto free;
4095         }
4096
4097         INIT_LIST_HEAD(&handler->list);
4098
4099         memcpy(spec->match_value, cmd_in, inlen);
4100         memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
4101                fs_matcher->mask_len);
4102         spec->match_criteria_enable = fs_matcher->match_criteria_enable;
4103         spec->flow_context = *flow_context;
4104
4105         handler->rule = mlx5_add_flow_rules(ft, spec,
4106                                             flow_act, dst, dst_num);
4107
4108         if (IS_ERR(handler->rule)) {
4109                 err = PTR_ERR(handler->rule);
4110                 goto free;
4111         }
4112
4113         ft_prio->refcount++;
4114         handler->prio = ft_prio;
4115         handler->dev = dev;
4116         ft_prio->flow_table = ft;
4117
4118 free:
4119         if (err)
4120                 kfree(handler);
4121         kvfree(spec);
4122         return err ? ERR_PTR(err) : handler;
4123 }
4124
4125 static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
4126                                 void *match_v)
4127 {
4128         void *match_c;
4129         void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
4130         void *dmac, *dmac_mask;
4131         void *ipv4, *ipv4_mask;
4132
4133         if (!(fs_matcher->match_criteria_enable &
4134               (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
4135                 return false;
4136
4137         match_c = fs_matcher->matcher_mask.match_params;
4138         match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
4139                                            outer_headers);
4140         match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
4141                                            outer_headers);
4142
4143         dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4144                             dmac_47_16);
4145         dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4146                                  dmac_47_16);
4147
4148         if (is_multicast_ether_addr(dmac) &&
4149             is_multicast_ether_addr(dmac_mask))
4150                 return true;
4151
4152         ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4153                             dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4154
4155         ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4156                                  dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4157
4158         if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
4159             ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
4160                 return true;
4161
4162         return false;
4163 }
4164
4165 struct mlx5_ib_flow_handler *
4166 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
4167                         struct mlx5_ib_flow_matcher *fs_matcher,
4168                         struct mlx5_flow_context *flow_context,
4169                         struct mlx5_flow_act *flow_act,
4170                         u32 counter_id,
4171                         void *cmd_in, int inlen, int dest_id,
4172                         int dest_type)
4173 {
4174         struct mlx5_flow_destination *dst;
4175         struct mlx5_ib_flow_prio *ft_prio;
4176         struct mlx5_ib_flow_handler *handler;
4177         int dst_num = 0;
4178         bool mcast;
4179         int err;
4180
4181         if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
4182                 return ERR_PTR(-EOPNOTSUPP);
4183
4184         if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
4185                 return ERR_PTR(-ENOMEM);
4186
4187         dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
4188         if (!dst)
4189                 return ERR_PTR(-ENOMEM);
4190
4191         mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
4192         mutex_lock(&dev->flow_db->lock);
4193
4194         ft_prio = _get_flow_table(dev, fs_matcher, mcast);
4195         if (IS_ERR(ft_prio)) {
4196                 err = PTR_ERR(ft_prio);
4197                 goto unlock;
4198         }
4199
4200         if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
4201                 dst[dst_num].type = dest_type;
4202                 dst[dst_num].tir_num = dest_id;
4203                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4204         } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
4205                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
4206                 dst[dst_num].ft_num = dest_id;
4207                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4208         } else {
4209                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
4210                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
4211         }
4212
4213         dst_num++;
4214
4215         if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
4216                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
4217                 dst[dst_num].counter_id = counter_id;
4218                 dst_num++;
4219         }
4220
4221         handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
4222                                         flow_context, flow_act,
4223                                         cmd_in, inlen, dst_num);
4224
4225         if (IS_ERR(handler)) {
4226                 err = PTR_ERR(handler);
4227                 goto destroy_ft;
4228         }
4229
4230         mutex_unlock(&dev->flow_db->lock);
4231         atomic_inc(&fs_matcher->usecnt);
4232         handler->flow_matcher = fs_matcher;
4233
4234         kfree(dst);
4235
4236         return handler;
4237
4238 destroy_ft:
4239         put_flow_table(dev, ft_prio, false);
4240 unlock:
4241         mutex_unlock(&dev->flow_db->lock);
4242         kfree(dst);
4243
4244         return ERR_PTR(err);
4245 }
4246
4247 static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
4248 {
4249         u32 flags = 0;
4250
4251         if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
4252                 flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
4253
4254         return flags;
4255 }
4256
4257 #define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED      MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
4258 static struct ib_flow_action *
4259 mlx5_ib_create_flow_action_esp(struct ib_device *device,
4260                                const struct ib_flow_action_attrs_esp *attr,
4261                                struct uverbs_attr_bundle *attrs)
4262 {
4263         struct mlx5_ib_dev *mdev = to_mdev(device);
4264         struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
4265         struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
4266         struct mlx5_ib_flow_action *action;
4267         u64 action_flags;
4268         u64 flags;
4269         int err = 0;
4270
4271         err = uverbs_get_flags64(
4272                 &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
4273                 ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
4274         if (err)
4275                 return ERR_PTR(err);
4276
4277         flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
4278
4279         /* We current only support a subset of the standard features. Only a
4280          * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn
4281          * (with overlap). Full offload mode isn't supported.
4282          */
4283         if (!attr->keymat || attr->replay || attr->encap ||
4284             attr->spi || attr->seq || attr->tfc_pad ||
4285             attr->hard_limit_pkts ||
4286             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4287                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
4288                 return ERR_PTR(-EOPNOTSUPP);
4289
4290         if (attr->keymat->protocol !=
4291             IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
4292                 return ERR_PTR(-EOPNOTSUPP);
4293
4294         aes_gcm = &attr->keymat->keymat.aes_gcm;
4295
4296         if (aes_gcm->icv_len != 16 ||
4297             aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
4298                 return ERR_PTR(-EOPNOTSUPP);
4299
4300         action = kmalloc(sizeof(*action), GFP_KERNEL);
4301         if (!action)
4302                 return ERR_PTR(-ENOMEM);
4303
4304         action->esp_aes_gcm.ib_flags = attr->flags;
4305         memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
4306                sizeof(accel_attrs.keymat.aes_gcm.aes_key));
4307         accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
4308         memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
4309                sizeof(accel_attrs.keymat.aes_gcm.salt));
4310         memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
4311                sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
4312         accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
4313         accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
4314         accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
4315
4316         accel_attrs.esn = attr->esn;
4317         if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
4318                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
4319         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4320                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4321
4322         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
4323                 accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
4324
4325         action->esp_aes_gcm.ctx =
4326                 mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
4327         if (IS_ERR(action->esp_aes_gcm.ctx)) {
4328                 err = PTR_ERR(action->esp_aes_gcm.ctx);
4329                 goto err_parse;
4330         }
4331
4332         action->esp_aes_gcm.ib_flags = attr->flags;
4333
4334         return &action->ib_action;
4335
4336 err_parse:
4337         kfree(action);
4338         return ERR_PTR(err);
4339 }
4340
4341 static int
4342 mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
4343                                const struct ib_flow_action_attrs_esp *attr,
4344                                struct uverbs_attr_bundle *attrs)
4345 {
4346         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4347         struct mlx5_accel_esp_xfrm_attrs accel_attrs;
4348         int err = 0;
4349
4350         if (attr->keymat || attr->replay || attr->encap ||
4351             attr->spi || attr->seq || attr->tfc_pad ||
4352             attr->hard_limit_pkts ||
4353             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4354                              IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
4355                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
4356                 return -EOPNOTSUPP;
4357
4358         /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
4359          * be modified.
4360          */
4361         if (!(maction->esp_aes_gcm.ib_flags &
4362               IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
4363             attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4364                            IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
4365                 return -EINVAL;
4366
4367         memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
4368                sizeof(accel_attrs));
4369
4370         accel_attrs.esn = attr->esn;
4371         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4372                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4373         else
4374                 accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4375
4376         err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
4377                                          &accel_attrs);
4378         if (err)
4379                 return err;
4380
4381         maction->esp_aes_gcm.ib_flags &=
4382                 ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4383         maction->esp_aes_gcm.ib_flags |=
4384                 attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4385
4386         return 0;
4387 }
4388
4389 static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
4390 {
4391         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4392
4393         switch (action->type) {
4394         case IB_FLOW_ACTION_ESP:
4395                 /*
4396                  * We only support aes_gcm by now, so we implicitly know this is
4397                  * the underline crypto.
4398                  */
4399                 mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
4400                 break;
4401         case IB_FLOW_ACTION_UNSPECIFIED:
4402                 mlx5_ib_destroy_flow_action_raw(maction);
4403                 break;
4404         default:
4405                 WARN_ON(true);
4406                 break;
4407         }
4408
4409         kfree(maction);
4410         return 0;
4411 }
4412
4413 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4414 {
4415         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4416         struct mlx5_ib_qp *mqp = to_mqp(ibqp);
4417         int err;
4418         u16 uid;
4419
4420         uid = ibqp->pd ?
4421                 to_mpd(ibqp->pd)->uid : 0;
4422
4423         if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
4424                 mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
4425                 return -EOPNOTSUPP;
4426         }
4427
4428         err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4429         if (err)
4430                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
4431                              ibqp->qp_num, gid->raw);
4432
4433         return err;
4434 }
4435
4436 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4437 {
4438         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4439         int err;
4440         u16 uid;
4441
4442         uid = ibqp->pd ?
4443                 to_mpd(ibqp->pd)->uid : 0;
4444         err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4445         if (err)
4446                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
4447                              ibqp->qp_num, gid->raw);
4448
4449         return err;
4450 }
4451
4452 static int init_node_data(struct mlx5_ib_dev *dev)
4453 {
4454         int err;
4455
4456         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
4457         if (err)
4458                 return err;
4459
4460         dev->mdev->rev_id = dev->mdev->pdev->revision;
4461
4462         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
4463 }
4464
4465 static ssize_t fw_pages_show(struct device *device,
4466                              struct device_attribute *attr, char *buf)
4467 {
4468         struct mlx5_ib_dev *dev =
4469                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4470
4471         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
4472 }
4473 static DEVICE_ATTR_RO(fw_pages);
4474
4475 static ssize_t reg_pages_show(struct device *device,
4476                               struct device_attribute *attr, char *buf)
4477 {
4478         struct mlx5_ib_dev *dev =
4479                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4480
4481         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
4482 }
4483 static DEVICE_ATTR_RO(reg_pages);
4484
4485 static ssize_t hca_type_show(struct device *device,
4486                              struct device_attribute *attr, char *buf)
4487 {
4488         struct mlx5_ib_dev *dev =
4489                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4490
4491         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
4492 }
4493 static DEVICE_ATTR_RO(hca_type);
4494
4495 static ssize_t hw_rev_show(struct device *device,
4496                            struct device_attribute *attr, char *buf)
4497 {
4498         struct mlx5_ib_dev *dev =
4499                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4500
4501         return sprintf(buf, "%x\n", dev->mdev->rev_id);
4502 }
4503 static DEVICE_ATTR_RO(hw_rev);
4504
4505 static ssize_t board_id_show(struct device *device,
4506                              struct device_attribute *attr, char *buf)
4507 {
4508         struct mlx5_ib_dev *dev =
4509                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4510
4511         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
4512                        dev->mdev->board_id);
4513 }
4514 static DEVICE_ATTR_RO(board_id);
4515
4516 static struct attribute *mlx5_class_attributes[] = {
4517         &dev_attr_hw_rev.attr,
4518         &dev_attr_hca_type.attr,
4519         &dev_attr_board_id.attr,
4520         &dev_attr_fw_pages.attr,
4521         &dev_attr_reg_pages.attr,
4522         NULL,
4523 };
4524
4525 static const struct attribute_group mlx5_attr_group = {
4526         .attrs = mlx5_class_attributes,
4527 };
4528
4529 static void pkey_change_handler(struct work_struct *work)
4530 {
4531         struct mlx5_ib_port_resources *ports =
4532                 container_of(work, struct mlx5_ib_port_resources,
4533                              pkey_change_work);
4534
4535         mutex_lock(&ports->devr->mutex);
4536         mlx5_ib_gsi_pkey_change(ports->gsi);
4537         mutex_unlock(&ports->devr->mutex);
4538 }
4539
4540 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
4541 {
4542         struct mlx5_ib_qp *mqp;
4543         struct mlx5_ib_cq *send_mcq, *recv_mcq;
4544         struct mlx5_core_cq *mcq;
4545         struct list_head cq_armed_list;
4546         unsigned long flags_qp;
4547         unsigned long flags_cq;
4548         unsigned long flags;
4549
4550         INIT_LIST_HEAD(&cq_armed_list);
4551
4552         /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/
4553         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
4554         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
4555                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
4556                 if (mqp->sq.tail != mqp->sq.head) {
4557                         send_mcq = to_mcq(mqp->ibqp.send_cq);
4558                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
4559                         if (send_mcq->mcq.comp &&
4560                             mqp->ibqp.send_cq->comp_handler) {
4561                                 if (!send_mcq->mcq.reset_notify_added) {
4562                                         send_mcq->mcq.reset_notify_added = 1;
4563                                         list_add_tail(&send_mcq->mcq.reset_notify,
4564                                                       &cq_armed_list);
4565                                 }
4566                         }
4567                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
4568                 }
4569                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
4570                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
4571                 /* no handling is needed for SRQ */
4572                 if (!mqp->ibqp.srq) {
4573                         if (mqp->rq.tail != mqp->rq.head) {
4574                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
4575                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
4576                                 if (recv_mcq->mcq.comp &&
4577                                     mqp->ibqp.recv_cq->comp_handler) {
4578                                         if (!recv_mcq->mcq.reset_notify_added) {
4579                                                 recv_mcq->mcq.reset_notify_added = 1;
4580                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
4581                                                               &cq_armed_list);
4582                                         }
4583                                 }
4584                                 spin_unlock_irqrestore(&recv_mcq->lock,
4585                                                        flags_cq);
4586                         }
4587                 }
4588                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
4589         }
4590         /*At that point all inflight post send were put to be executed as of we
4591          * lock/unlock above locks Now need to arm all involved CQs.
4592          */
4593         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
4594                 mcq->comp(mcq, NULL);
4595         }
4596         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
4597 }
4598
4599 static void delay_drop_handler(struct work_struct *work)
4600 {
4601         int err;
4602         struct mlx5_ib_delay_drop *delay_drop =
4603                 container_of(work, struct mlx5_ib_delay_drop,
4604                              delay_drop_work);
4605
4606         atomic_inc(&delay_drop->events_cnt);
4607
4608         mutex_lock(&delay_drop->lock);
4609         err = mlx5_core_set_delay_drop(delay_drop->dev->mdev,
4610                                        delay_drop->timeout);
4611         if (err) {
4612                 mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
4613                              delay_drop->timeout);
4614                 delay_drop->activate = false;
4615         }
4616         mutex_unlock(&delay_drop->lock);
4617 }
4618
4619 static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4620                                  struct ib_event *ibev)
4621 {
4622         u8 port = (eqe->data.port.port >> 4) & 0xf;
4623
4624         switch (eqe->sub_type) {
4625         case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
4626                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4627                                             IB_LINK_LAYER_ETHERNET)
4628                         schedule_work(&ibdev->delay_drop.delay_drop_work);
4629                 break;
4630         default: /* do nothing */
4631                 return;
4632         }
4633 }
4634
4635 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4636                               struct ib_event *ibev)
4637 {
4638         u8 port = (eqe->data.port.port >> 4) & 0xf;
4639
4640         ibev->element.port_num = port;
4641
4642         switch (eqe->sub_type) {
4643         case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
4644         case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
4645         case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
4646                 /* In RoCE, port up/down events are handled in
4647                  * mlx5_netdev_event().
4648                  */
4649                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4650                                             IB_LINK_LAYER_ETHERNET)
4651                         return -EINVAL;
4652
4653                 ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
4654                                 IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
4655                 break;
4656
4657         case MLX5_PORT_CHANGE_SUBTYPE_LID:
4658                 ibev->event = IB_EVENT_LID_CHANGE;
4659                 break;
4660
4661         case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
4662                 ibev->event = IB_EVENT_PKEY_CHANGE;
4663                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
4664                 break;
4665
4666         case MLX5_PORT_CHANGE_SUBTYPE_GUID:
4667                 ibev->event = IB_EVENT_GID_CHANGE;
4668                 break;
4669
4670         case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
4671                 ibev->event = IB_EVENT_CLIENT_REREGISTER;
4672                 break;
4673         default:
4674                 return -EINVAL;
4675         }
4676
4677         return 0;
4678 }
4679
4680 static void mlx5_ib_handle_event(struct work_struct *_work)
4681 {
4682         struct mlx5_ib_event_work *work =
4683                 container_of(_work, struct mlx5_ib_event_work, work);
4684         struct mlx5_ib_dev *ibdev;
4685         struct ib_event ibev;
4686         bool fatal = false;
4687
4688         if (work->is_slave) {
4689                 ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
4690                 if (!ibdev)
4691                         goto out;
4692         } else {
4693                 ibdev = work->dev;
4694         }
4695
4696         switch (work->event) {
4697         case MLX5_DEV_EVENT_SYS_ERROR:
4698                 ibev.event = IB_EVENT_DEVICE_FATAL;
4699                 mlx5_ib_handle_internal_error(ibdev);
4700                 ibev.element.port_num  = (u8)(unsigned long)work->param;
4701                 fatal = true;
4702                 break;
4703         case MLX5_EVENT_TYPE_PORT_CHANGE:
4704                 if (handle_port_change(ibdev, work->param, &ibev))
4705                         goto out;
4706                 break;
4707         case MLX5_EVENT_TYPE_GENERAL_EVENT:
4708                 handle_general_event(ibdev, work->param, &ibev);
4709                 /* fall through */
4710         default:
4711                 goto out;
4712         }
4713
4714         ibev.device = &ibdev->ib_dev;
4715
4716         if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
4717                 mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
4718                 goto out;
4719         }
4720
4721         if (ibdev->ib_active)
4722                 ib_dispatch_event(&ibev);
4723
4724         if (fatal)
4725                 ibdev->ib_active = false;
4726 out:
4727         kfree(work);
4728 }
4729
4730 static int mlx5_ib_event(struct notifier_block *nb,
4731                          unsigned long event, void *param)
4732 {
4733         struct mlx5_ib_event_work *work;
4734
4735         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4736         if (!work)
4737                 return NOTIFY_DONE;
4738
4739         INIT_WORK(&work->work, mlx5_ib_handle_event);
4740         work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
4741         work->is_slave = false;
4742         work->param = param;
4743         work->event = event;
4744
4745         queue_work(mlx5_ib_event_wq, &work->work);
4746
4747         return NOTIFY_OK;
4748 }
4749
4750 static int mlx5_ib_event_slave_port(struct notifier_block *nb,
4751                                     unsigned long event, void *param)
4752 {
4753         struct mlx5_ib_event_work *work;
4754
4755         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4756         if (!work)
4757                 return NOTIFY_DONE;
4758
4759         INIT_WORK(&work->work, mlx5_ib_handle_event);
4760         work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
4761         work->is_slave = true;
4762         work->param = param;
4763         work->event = event;
4764         queue_work(mlx5_ib_event_wq, &work->work);
4765
4766         return NOTIFY_OK;
4767 }
4768
4769 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
4770 {
4771         struct mlx5_hca_vport_context vport_ctx;
4772         int err;
4773         int port;
4774
4775         for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
4776                 dev->mdev->port_caps[port - 1].has_smi = false;
4777                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
4778                     MLX5_CAP_PORT_TYPE_IB) {
4779                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
4780                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
4781                                                                    port, 0,
4782                                                                    &vport_ctx);
4783                                 if (err) {
4784                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
4785                                                     port, err);
4786                                         return err;
4787                                 }
4788                                 dev->mdev->port_caps[port - 1].has_smi =
4789                                         vport_ctx.has_smi;
4790                         } else {
4791                                 dev->mdev->port_caps[port - 1].has_smi = true;
4792                         }
4793                 }
4794         }
4795         return 0;
4796 }
4797
4798 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
4799 {
4800         int port;
4801
4802         for (port = 1; port <= dev->num_ports; port++)
4803                 mlx5_query_ext_port_caps(dev, port);
4804 }
4805
4806 static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4807 {
4808         struct ib_device_attr *dprops = NULL;
4809         struct ib_port_attr *pprops = NULL;
4810         int err = -ENOMEM;
4811
4812         pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
4813         if (!pprops)
4814                 goto out;
4815
4816         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
4817         if (!dprops)
4818                 goto out;
4819
4820         err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
4821         if (err) {
4822                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
4823                 goto out;
4824         }
4825
4826         err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
4827         if (err) {
4828                 mlx5_ib_warn(dev, "query_port %d failed %d\n",
4829                              port, err);
4830                 goto out;
4831         }
4832
4833         dev->mdev->port_caps[port - 1].pkey_table_len =
4834                                         dprops->max_pkeys;
4835         dev->mdev->port_caps[port - 1].gid_table_len =
4836                                         pprops->gid_tbl_len;
4837         mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
4838                     port, dprops->max_pkeys, pprops->gid_tbl_len);
4839
4840 out:
4841         kfree(pprops);
4842         kfree(dprops);
4843
4844         return err;
4845 }
4846
4847 static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4848 {
4849         /* For representors use port 1, is this is the only native
4850          * port
4851          */
4852         if (dev->is_rep)
4853                 return __get_port_caps(dev, 1);
4854         return __get_port_caps(dev, port);
4855 }
4856
4857 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
4858 {
4859         int err;
4860
4861         err = mlx5_mr_cache_cleanup(dev);
4862         if (err)
4863                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
4864
4865         if (dev->umrc.qp)
4866                 mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
4867         if (dev->umrc.cq)
4868                 ib_free_cq(dev->umrc.cq);
4869         if (dev->umrc.pd)
4870                 ib_dealloc_pd(dev->umrc.pd);
4871 }
4872
4873 enum {
4874         MAX_UMR_WR = 128,
4875 };
4876
4877 static int create_umr_res(struct mlx5_ib_dev *dev)
4878 {
4879         struct ib_qp_init_attr *init_attr = NULL;
4880         struct ib_qp_attr *attr = NULL;
4881         struct ib_pd *pd;
4882         struct ib_cq *cq;
4883         struct ib_qp *qp;
4884         int ret;
4885
4886         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
4887         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
4888         if (!attr || !init_attr) {
4889                 ret = -ENOMEM;
4890                 goto error_0;
4891         }
4892
4893         pd = ib_alloc_pd(&dev->ib_dev, 0);
4894         if (IS_ERR(pd)) {
4895                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
4896                 ret = PTR_ERR(pd);
4897                 goto error_0;
4898         }
4899
4900         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
4901         if (IS_ERR(cq)) {
4902                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
4903                 ret = PTR_ERR(cq);
4904                 goto error_2;
4905         }
4906
4907         init_attr->send_cq = cq;
4908         init_attr->recv_cq = cq;
4909         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
4910         init_attr->cap.max_send_wr = MAX_UMR_WR;
4911         init_attr->cap.max_send_sge = 1;
4912         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
4913         init_attr->port_num = 1;
4914         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
4915         if (IS_ERR(qp)) {
4916                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
4917                 ret = PTR_ERR(qp);
4918                 goto error_3;
4919         }
4920         qp->device     = &dev->ib_dev;
4921         qp->real_qp    = qp;
4922         qp->uobject    = NULL;
4923         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
4924         qp->send_cq    = init_attr->send_cq;
4925         qp->recv_cq    = init_attr->recv_cq;
4926
4927         attr->qp_state = IB_QPS_INIT;
4928         attr->port_num = 1;
4929         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
4930                                 IB_QP_PORT, NULL);
4931         if (ret) {
4932                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
4933                 goto error_4;
4934         }
4935
4936         memset(attr, 0, sizeof(*attr));
4937         attr->qp_state = IB_QPS_RTR;
4938         attr->path_mtu = IB_MTU_256;
4939
4940         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4941         if (ret) {
4942                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
4943                 goto error_4;
4944         }
4945
4946         memset(attr, 0, sizeof(*attr));
4947         attr->qp_state = IB_QPS_RTS;
4948         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4949         if (ret) {
4950                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
4951                 goto error_4;
4952         }
4953
4954         dev->umrc.qp = qp;
4955         dev->umrc.cq = cq;
4956         dev->umrc.pd = pd;
4957
4958         sema_init(&dev->umrc.sem, MAX_UMR_WR);
4959         ret = mlx5_mr_cache_init(dev);
4960         if (ret) {
4961                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4962                 goto error_4;
4963         }
4964
4965         kfree(attr);
4966         kfree(init_attr);
4967
4968         return 0;
4969
4970 error_4:
4971         mlx5_ib_destroy_qp(qp, NULL);
4972         dev->umrc.qp = NULL;
4973
4974 error_3:
4975         ib_free_cq(cq);
4976         dev->umrc.cq = NULL;
4977
4978 error_2:
4979         ib_dealloc_pd(pd);
4980         dev->umrc.pd = NULL;
4981
4982 error_0:
4983         kfree(attr);
4984         kfree(init_attr);
4985         return ret;
4986 }
4987
4988 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
4989 {
4990         switch (umr_fence_cap) {
4991         case MLX5_CAP_UMR_FENCE_NONE:
4992                 return MLX5_FENCE_MODE_NONE;
4993         case MLX5_CAP_UMR_FENCE_SMALL:
4994                 return MLX5_FENCE_MODE_INITIATOR_SMALL;
4995         default:
4996                 return MLX5_FENCE_MODE_STRONG_ORDERING;
4997         }
4998 }
4999
5000 static int create_dev_resources(struct mlx5_ib_resources *devr)
5001 {
5002         struct ib_srq_init_attr attr;
5003         struct mlx5_ib_dev *dev;
5004         struct ib_device *ibdev;
5005         struct ib_cq_init_attr cq_attr = {.cqe = 1};
5006         int port;
5007         int ret = 0;
5008
5009         dev = container_of(devr, struct mlx5_ib_dev, devr);
5010         ibdev = &dev->ib_dev;
5011
5012         mutex_init(&devr->mutex);
5013
5014         devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
5015         if (!devr->p0)
5016                 return -ENOMEM;
5017
5018         devr->p0->device  = ibdev;
5019         devr->p0->uobject = NULL;
5020         atomic_set(&devr->p0->usecnt, 0);
5021
5022         ret = mlx5_ib_alloc_pd(devr->p0, NULL);
5023         if (ret)
5024                 goto error0;
5025
5026         devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
5027         if (!devr->c0) {
5028                 ret = -ENOMEM;
5029                 goto error1;
5030         }
5031
5032         devr->c0->device = &dev->ib_dev;
5033         atomic_set(&devr->c0->usecnt, 0);
5034
5035         ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
5036         if (ret)
5037                 goto err_create_cq;
5038
5039         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
5040         if (IS_ERR(devr->x0)) {
5041                 ret = PTR_ERR(devr->x0);
5042                 goto error2;
5043         }
5044         devr->x0->device = &dev->ib_dev;
5045         devr->x0->inode = NULL;
5046         atomic_set(&devr->x0->usecnt, 0);
5047         mutex_init(&devr->x0->tgt_qp_mutex);
5048         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
5049
5050         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
5051         if (IS_ERR(devr->x1)) {
5052                 ret = PTR_ERR(devr->x1);
5053                 goto error3;
5054         }
5055         devr->x1->device = &dev->ib_dev;
5056         devr->x1->inode = NULL;
5057         atomic_set(&devr->x1->usecnt, 0);
5058         mutex_init(&devr->x1->tgt_qp_mutex);
5059         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
5060
5061         memset(&attr, 0, sizeof(attr));
5062         attr.attr.max_sge = 1;
5063         attr.attr.max_wr = 1;
5064         attr.srq_type = IB_SRQT_XRC;
5065         attr.ext.cq = devr->c0;
5066         attr.ext.xrc.xrcd = devr->x0;
5067
5068         devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5069         if (!devr->s0) {
5070                 ret = -ENOMEM;
5071                 goto error4;
5072         }
5073
5074         devr->s0->device        = &dev->ib_dev;
5075         devr->s0->pd            = devr->p0;
5076         devr->s0->srq_type      = IB_SRQT_XRC;
5077         devr->s0->ext.xrc.xrcd  = devr->x0;
5078         devr->s0->ext.cq        = devr->c0;
5079         ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
5080         if (ret)
5081                 goto err_create;
5082
5083         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
5084         atomic_inc(&devr->s0->ext.cq->usecnt);
5085         atomic_inc(&devr->p0->usecnt);
5086         atomic_set(&devr->s0->usecnt, 0);
5087
5088         memset(&attr, 0, sizeof(attr));
5089         attr.attr.max_sge = 1;
5090         attr.attr.max_wr = 1;
5091         attr.srq_type = IB_SRQT_BASIC;
5092         devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5093         if (!devr->s1) {
5094                 ret = -ENOMEM;
5095                 goto error5;
5096         }
5097
5098         devr->s1->device        = &dev->ib_dev;
5099         devr->s1->pd            = devr->p0;
5100         devr->s1->srq_type      = IB_SRQT_BASIC;
5101         devr->s1->ext.cq        = devr->c0;
5102
5103         ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
5104         if (ret)
5105                 goto error6;
5106
5107         atomic_inc(&devr->p0->usecnt);
5108         atomic_set(&devr->s1->usecnt, 0);
5109
5110         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
5111                 INIT_WORK(&devr->ports[port].pkey_change_work,
5112                           pkey_change_handler);
5113                 devr->ports[port].devr = devr;
5114         }
5115
5116         return 0;
5117
5118 error6:
5119         kfree(devr->s1);
5120 error5:
5121         mlx5_ib_destroy_srq(devr->s0, NULL);
5122 err_create:
5123         kfree(devr->s0);
5124 error4:
5125         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5126 error3:
5127         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5128 error2:
5129         mlx5_ib_destroy_cq(devr->c0, NULL);
5130 err_create_cq:
5131         kfree(devr->c0);
5132 error1:
5133         mlx5_ib_dealloc_pd(devr->p0, NULL);
5134 error0:
5135         kfree(devr->p0);
5136         return ret;
5137 }
5138
5139 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
5140 {
5141         int port;
5142
5143         mlx5_ib_destroy_srq(devr->s1, NULL);
5144         kfree(devr->s1);
5145         mlx5_ib_destroy_srq(devr->s0, NULL);
5146         kfree(devr->s0);
5147         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5148         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5149         mlx5_ib_destroy_cq(devr->c0, NULL);
5150         kfree(devr->c0);
5151         mlx5_ib_dealloc_pd(devr->p0, NULL);
5152         kfree(devr->p0);
5153
5154         /* Make sure no change P_Key work items are still executing */
5155         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
5156                 cancel_work_sync(&devr->ports[port].pkey_change_work);
5157 }
5158
5159 static u32 get_core_cap_flags(struct ib_device *ibdev,
5160                               struct mlx5_hca_vport_context *rep)
5161 {
5162         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5163         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
5164         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
5165         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
5166         bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
5167         u32 ret = 0;
5168
5169         if (rep->grh_required)
5170                 ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
5171
5172         if (ll == IB_LINK_LAYER_INFINIBAND)
5173                 return ret | RDMA_CORE_PORT_IBA_IB;
5174
5175         if (raw_support)
5176                 ret |= RDMA_CORE_PORT_RAW_PACKET;
5177
5178         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
5179                 return ret;
5180
5181         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
5182                 return ret;
5183
5184         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
5185                 ret |= RDMA_CORE_PORT_IBA_ROCE;
5186
5187         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
5188                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
5189
5190         return ret;
5191 }
5192
5193 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
5194                                struct ib_port_immutable *immutable)
5195 {
5196         struct ib_port_attr attr;
5197         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5198         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
5199         struct mlx5_hca_vport_context rep = {0};
5200         int err;
5201
5202         err = ib_query_port(ibdev, port_num, &attr);
5203         if (err)
5204                 return err;
5205
5206         if (ll == IB_LINK_LAYER_INFINIBAND) {
5207                 err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
5208                                                    &rep);
5209                 if (err)
5210                         return err;
5211         }
5212
5213         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5214         immutable->gid_tbl_len = attr.gid_tbl_len;
5215         immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
5216         immutable->max_mad_size = IB_MGMT_MAD_SIZE;
5217
5218         return 0;
5219 }
5220
5221 static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
5222                                    struct ib_port_immutable *immutable)
5223 {
5224         struct ib_port_attr attr;
5225         int err;
5226
5227         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5228
5229         err = ib_query_port(ibdev, port_num, &attr);
5230         if (err)
5231                 return err;
5232
5233         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5234         immutable->gid_tbl_len = attr.gid_tbl_len;
5235         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5236
5237         return 0;
5238 }
5239
5240 static void get_dev_fw_str(struct ib_device *ibdev, char *str)
5241 {
5242         struct mlx5_ib_dev *dev =
5243                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
5244         snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
5245                  fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
5246                  fw_rev_sub(dev->mdev));
5247 }
5248
5249 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
5250 {
5251         struct mlx5_core_dev *mdev = dev->mdev;
5252         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
5253                                                                  MLX5_FLOW_NAMESPACE_LAG);
5254         struct mlx5_flow_table *ft;
5255         int err;
5256
5257         if (!ns || !mlx5_lag_is_roce(mdev))
5258                 return 0;
5259
5260         err = mlx5_cmd_create_vport_lag(mdev);
5261         if (err)
5262                 return err;
5263
5264         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
5265         if (IS_ERR(ft)) {
5266                 err = PTR_ERR(ft);
5267                 goto err_destroy_vport_lag;
5268         }
5269
5270         dev->flow_db->lag_demux_ft = ft;
5271         dev->lag_active = true;
5272         return 0;
5273
5274 err_destroy_vport_lag:
5275         mlx5_cmd_destroy_vport_lag(mdev);
5276         return err;
5277 }
5278
5279 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
5280 {
5281         struct mlx5_core_dev *mdev = dev->mdev;
5282
5283         if (dev->lag_active) {
5284                 dev->lag_active = false;
5285
5286                 mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
5287                 dev->flow_db->lag_demux_ft = NULL;
5288
5289                 mlx5_cmd_destroy_vport_lag(mdev);
5290         }
5291 }
5292
5293 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5294 {
5295         int err;
5296
5297         dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
5298         err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
5299         if (err) {
5300                 dev->port[port_num].roce.nb.notifier_call = NULL;
5301                 return err;
5302         }
5303
5304         return 0;
5305 }
5306
5307 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5308 {
5309         if (dev->port[port_num].roce.nb.notifier_call) {
5310                 unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
5311                 dev->port[port_num].roce.nb.notifier_call = NULL;
5312         }
5313 }
5314
5315 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
5316 {
5317         int err;
5318
5319         err = mlx5_nic_vport_enable_roce(dev->mdev);
5320         if (err)
5321                 return err;
5322
5323         err = mlx5_eth_lag_init(dev);
5324         if (err)
5325                 goto err_disable_roce;
5326
5327         return 0;
5328
5329 err_disable_roce:
5330         mlx5_nic_vport_disable_roce(dev->mdev);
5331
5332         return err;
5333 }
5334
5335 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
5336 {
5337         mlx5_eth_lag_cleanup(dev);
5338         mlx5_nic_vport_disable_roce(dev->mdev);
5339 }
5340
5341 struct mlx5_ib_counter {
5342         const char *name;
5343         size_t offset;
5344 };
5345
5346 #define INIT_Q_COUNTER(_name)           \
5347         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
5348
5349 static const struct mlx5_ib_counter basic_q_cnts[] = {
5350         INIT_Q_COUNTER(rx_write_requests),
5351         INIT_Q_COUNTER(rx_read_requests),
5352         INIT_Q_COUNTER(rx_atomic_requests),
5353         INIT_Q_COUNTER(out_of_buffer),
5354 };
5355
5356 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
5357         INIT_Q_COUNTER(out_of_sequence),
5358 };
5359
5360 static const struct mlx5_ib_counter retrans_q_cnts[] = {
5361         INIT_Q_COUNTER(duplicate_request),
5362         INIT_Q_COUNTER(rnr_nak_retry_err),
5363         INIT_Q_COUNTER(packet_seq_err),
5364         INIT_Q_COUNTER(implied_nak_seq_err),
5365         INIT_Q_COUNTER(local_ack_timeout_err),
5366 };
5367
5368 #define INIT_CONG_COUNTER(_name)                \
5369         { .name = #_name, .offset =     \
5370                 MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
5371
5372 static const struct mlx5_ib_counter cong_cnts[] = {
5373         INIT_CONG_COUNTER(rp_cnp_ignored),
5374         INIT_CONG_COUNTER(rp_cnp_handled),
5375         INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
5376         INIT_CONG_COUNTER(np_cnp_sent),
5377 };
5378
5379 static const struct mlx5_ib_counter extended_err_cnts[] = {
5380         INIT_Q_COUNTER(resp_local_length_error),
5381         INIT_Q_COUNTER(resp_cqe_error),
5382         INIT_Q_COUNTER(req_cqe_error),
5383         INIT_Q_COUNTER(req_remote_invalid_request),
5384         INIT_Q_COUNTER(req_remote_access_errors),
5385         INIT_Q_COUNTER(resp_remote_access_errors),
5386         INIT_Q_COUNTER(resp_cqe_flush_error),
5387         INIT_Q_COUNTER(req_cqe_flush_error),
5388 };
5389
5390 static const struct mlx5_ib_counter roce_accl_cnts[] = {
5391         INIT_Q_COUNTER(roce_adp_retrans),
5392         INIT_Q_COUNTER(roce_adp_retrans_to),
5393         INIT_Q_COUNTER(roce_slow_restart),
5394         INIT_Q_COUNTER(roce_slow_restart_cnps),
5395         INIT_Q_COUNTER(roce_slow_restart_trans),
5396 };
5397
5398 #define INIT_EXT_PPCNT_COUNTER(_name)           \
5399         { .name = #_name, .offset =     \
5400         MLX5_BYTE_OFF(ppcnt_reg, \
5401                       counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
5402
5403 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
5404         INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
5405 };
5406
5407 static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
5408 {
5409         return MLX5_ESWITCH_MANAGER(mdev) &&
5410                mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
5411                        MLX5_ESWITCH_OFFLOADS;
5412 }
5413
5414 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
5415 {
5416         int num_cnt_ports;
5417         int i;
5418
5419         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5420
5421         for (i = 0; i < num_cnt_ports; i++) {
5422                 if (dev->port[i].cnts.set_id_valid)
5423                         mlx5_core_dealloc_q_counter(dev->mdev,
5424                                                     dev->port[i].cnts.set_id);
5425                 kfree(dev->port[i].cnts.names);
5426                 kfree(dev->port[i].cnts.offsets);
5427         }
5428 }
5429
5430 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
5431                                     struct mlx5_ib_counters *cnts)
5432 {
5433         u32 num_counters;
5434
5435         num_counters = ARRAY_SIZE(basic_q_cnts);
5436
5437         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
5438                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
5439
5440         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
5441                 num_counters += ARRAY_SIZE(retrans_q_cnts);
5442
5443         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
5444                 num_counters += ARRAY_SIZE(extended_err_cnts);
5445
5446         if (MLX5_CAP_GEN(dev->mdev, roce_accl))
5447                 num_counters += ARRAY_SIZE(roce_accl_cnts);
5448
5449         cnts->num_q_counters = num_counters;
5450
5451         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5452                 cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
5453                 num_counters += ARRAY_SIZE(cong_cnts);
5454         }
5455         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5456                 cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
5457                 num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
5458         }
5459         cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
5460         if (!cnts->names)
5461                 return -ENOMEM;
5462
5463         cnts->offsets = kcalloc(num_counters,
5464                                 sizeof(cnts->offsets), GFP_KERNEL);
5465         if (!cnts->offsets)
5466                 goto err_names;
5467
5468         return 0;
5469
5470 err_names:
5471         kfree(cnts->names);
5472         cnts->names = NULL;
5473         return -ENOMEM;
5474 }
5475
5476 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
5477                                   const char **names,
5478                                   size_t *offsets)
5479 {
5480         int i;
5481         int j = 0;
5482
5483         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
5484                 names[j] = basic_q_cnts[i].name;
5485                 offsets[j] = basic_q_cnts[i].offset;
5486         }
5487
5488         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
5489                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
5490                         names[j] = out_of_seq_q_cnts[i].name;
5491                         offsets[j] = out_of_seq_q_cnts[i].offset;
5492                 }
5493         }
5494
5495         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
5496                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
5497                         names[j] = retrans_q_cnts[i].name;
5498                         offsets[j] = retrans_q_cnts[i].offset;
5499                 }
5500         }
5501
5502         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
5503                 for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
5504                         names[j] = extended_err_cnts[i].name;
5505                         offsets[j] = extended_err_cnts[i].offset;
5506                 }
5507         }
5508
5509         if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
5510                 for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
5511                         names[j] = roce_accl_cnts[i].name;
5512                         offsets[j] = roce_accl_cnts[i].offset;
5513                 }
5514         }
5515
5516         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5517                 for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
5518                         names[j] = cong_cnts[i].name;
5519                         offsets[j] = cong_cnts[i].offset;
5520                 }
5521         }
5522
5523         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5524                 for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
5525                         names[j] = ext_ppcnt_cnts[i].name;
5526                         offsets[j] = ext_ppcnt_cnts[i].offset;
5527                 }
5528         }
5529 }
5530
5531 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
5532 {
5533         int num_cnt_ports;
5534         int err = 0;
5535         int i;
5536         bool is_shared;
5537
5538         is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
5539         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5540
5541         for (i = 0; i < num_cnt_ports; i++) {
5542                 err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
5543                 if (err)
5544                         goto err_alloc;
5545
5546                 mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
5547                                       dev->port[i].cnts.offsets);
5548
5549                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5550                                                &dev->port[i].cnts.set_id,
5551                                                is_shared ?
5552                                                MLX5_SHARED_RESOURCE_UID : 0);
5553                 if (err) {
5554                         mlx5_ib_warn(dev,
5555                                      "couldn't allocate queue counter for port %d, err %d\n",
5556                                      i + 1, err);
5557                         goto err_alloc;
5558                 }
5559                 dev->port[i].cnts.set_id_valid = true;
5560         }
5561         return 0;
5562
5563 err_alloc:
5564         mlx5_ib_dealloc_counters(dev);
5565         return err;
5566 }
5567
5568 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
5569                                                    u8 port_num)
5570 {
5571         return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
5572                                                    &dev->port[port_num].cnts;
5573 }
5574
5575 /**
5576  * mlx5_ib_get_counters_id - Returns counters id to use for device+port
5577  * @dev:        Pointer to mlx5 IB device
5578  * @port_num:   Zero based port number
5579  *
5580  * mlx5_ib_get_counters_id() Returns counters set id to use for given
5581  * device port combination in switchdev and non switchdev mode of the
5582  * parent device.
5583  */
5584 u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
5585 {
5586         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
5587
5588         return cnts->set_id;
5589 }
5590
5591 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
5592                                                     u8 port_num)
5593 {
5594         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5595         const struct mlx5_ib_counters *cnts;
5596         bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
5597
5598         if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
5599                 return NULL;
5600
5601         cnts = get_counters(dev, port_num - 1);
5602
5603         return rdma_alloc_hw_stats_struct(cnts->names,
5604                                           cnts->num_q_counters +
5605                                           cnts->num_cong_counters +
5606                                           cnts->num_ext_ppcnt_counters,
5607                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5608 }
5609
5610 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
5611                                     const struct mlx5_ib_counters *cnts,
5612                                     struct rdma_hw_stats *stats,
5613                                     u16 set_id)
5614 {
5615         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
5616         void *out;
5617         __be32 val;
5618         int ret, i;
5619
5620         out = kvzalloc(outlen, GFP_KERNEL);
5621         if (!out)
5622                 return -ENOMEM;
5623
5624         ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
5625         if (ret)
5626                 goto free;
5627
5628         for (i = 0; i < cnts->num_q_counters; i++) {
5629                 val = *(__be32 *)(out + cnts->offsets[i]);
5630                 stats->value[i] = (u64)be32_to_cpu(val);
5631         }
5632
5633 free:
5634         kvfree(out);
5635         return ret;
5636 }
5637
5638 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
5639                                             const struct mlx5_ib_counters *cnts,
5640                                             struct rdma_hw_stats *stats)
5641 {
5642         int offset = cnts->num_q_counters + cnts->num_cong_counters;
5643         int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
5644         int ret, i;
5645         void *out;
5646
5647         out = kvzalloc(sz, GFP_KERNEL);
5648         if (!out)
5649                 return -ENOMEM;
5650
5651         ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
5652         if (ret)
5653                 goto free;
5654
5655         for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
5656                 stats->value[i + offset] =
5657                         be64_to_cpup((__be64 *)(out +
5658                                     cnts->offsets[i + offset]));
5659 free:
5660         kvfree(out);
5661         return ret;
5662 }
5663
5664 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
5665                                 struct rdma_hw_stats *stats,
5666                                 u8 port_num, int index)
5667 {
5668         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5669         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
5670         struct mlx5_core_dev *mdev;
5671         int ret, num_counters;
5672         u8 mdev_port_num;
5673
5674         if (!stats)
5675                 return -EINVAL;
5676
5677         num_counters = cnts->num_q_counters +
5678                        cnts->num_cong_counters +
5679                        cnts->num_ext_ppcnt_counters;
5680
5681         /* q_counters are per IB device, query the master mdev */
5682         ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
5683         if (ret)
5684                 return ret;
5685
5686         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5687                 ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
5688                 if (ret)
5689                         return ret;
5690         }
5691
5692         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5693                 mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
5694                                                     &mdev_port_num);
5695                 if (!mdev) {
5696                         /* If port is not affiliated yet, its in down state
5697                          * which doesn't have any counters yet, so it would be
5698                          * zero. So no need to read from the HCA.
5699                          */
5700                         goto done;
5701                 }
5702                 ret = mlx5_lag_query_cong_counters(dev->mdev,
5703                                                    stats->value +
5704                                                    cnts->num_q_counters,
5705                                                    cnts->num_cong_counters,
5706                                                    cnts->offsets +
5707                                                    cnts->num_q_counters);
5708
5709                 mlx5_ib_put_native_port_mdev(dev, port_num);
5710                 if (ret)
5711                         return ret;
5712         }
5713
5714 done:
5715         return num_counters;
5716 }
5717
5718 static struct rdma_hw_stats *
5719 mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
5720 {
5721         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5722         const struct mlx5_ib_counters *cnts =
5723                 get_counters(dev, counter->port - 1);
5724
5725         /* Q counters are in the beginning of all counters */
5726         return rdma_alloc_hw_stats_struct(cnts->names,
5727                                           cnts->num_q_counters,
5728                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5729 }
5730
5731 static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
5732 {
5733         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5734         const struct mlx5_ib_counters *cnts =
5735                 get_counters(dev, counter->port - 1);
5736
5737         return mlx5_ib_query_q_counters(dev->mdev, cnts,
5738                                         counter->stats, counter->id);
5739 }
5740
5741 static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
5742                                    struct ib_qp *qp)
5743 {
5744         struct mlx5_ib_dev *dev = to_mdev(qp->device);
5745         u16 cnt_set_id = 0;
5746         int err;
5747
5748         if (!counter->id) {
5749                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5750                                                &cnt_set_id,
5751                                                MLX5_SHARED_RESOURCE_UID);
5752                 if (err)
5753                         return err;
5754                 counter->id = cnt_set_id;
5755         }
5756
5757         err = mlx5_ib_qp_set_counter(qp, counter);
5758         if (err)
5759                 goto fail_set_counter;
5760
5761         return 0;
5762
5763 fail_set_counter:
5764         mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
5765         counter->id = 0;
5766
5767         return err;
5768 }
5769
5770 static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
5771 {
5772         return mlx5_ib_qp_set_counter(qp, NULL);
5773 }
5774
5775 static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
5776 {
5777         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5778
5779         return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
5780 }
5781
5782 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
5783                                  enum rdma_netdev_t type,
5784                                  struct rdma_netdev_alloc_params *params)
5785 {
5786         if (type != RDMA_NETDEV_IPOIB)
5787                 return -EOPNOTSUPP;
5788
5789         return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
5790 }
5791
5792 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
5793 {
5794         if (!dev->delay_drop.dir_debugfs)
5795                 return;
5796         debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
5797         dev->delay_drop.dir_debugfs = NULL;
5798 }
5799
5800 static void cancel_delay_drop(struct mlx5_ib_dev *dev)
5801 {
5802         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5803                 return;
5804
5805         cancel_work_sync(&dev->delay_drop.delay_drop_work);
5806         delay_drop_debugfs_cleanup(dev);
5807 }
5808
5809 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
5810                                        size_t count, loff_t *pos)
5811 {
5812         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5813         char lbuf[20];
5814         int len;
5815
5816         len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
5817         return simple_read_from_buffer(buf, count, pos, lbuf, len);
5818 }
5819
5820 static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
5821                                         size_t count, loff_t *pos)
5822 {
5823         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5824         u32 timeout;
5825         u32 var;
5826
5827         if (kstrtouint_from_user(buf, count, 0, &var))
5828                 return -EFAULT;
5829
5830         timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
5831                         1000);
5832         if (timeout != var)
5833                 mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
5834                             timeout);
5835
5836         delay_drop->timeout = timeout;
5837
5838         return count;
5839 }
5840
5841 static const struct file_operations fops_delay_drop_timeout = {
5842         .owner  = THIS_MODULE,
5843         .open   = simple_open,
5844         .write  = delay_drop_timeout_write,
5845         .read   = delay_drop_timeout_read,
5846 };
5847
5848 static void delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
5849 {
5850         struct dentry *root;
5851
5852         if (!mlx5_debugfs_root)
5853                 return;
5854
5855         root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
5856         dev->delay_drop.dir_debugfs = root;
5857
5858         debugfs_create_atomic_t("num_timeout_events", 0400, root,
5859                                 &dev->delay_drop.events_cnt);
5860         debugfs_create_atomic_t("num_rqs", 0400, root,
5861                                 &dev->delay_drop.rqs_cnt);
5862         debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
5863                             &fops_delay_drop_timeout);
5864 }
5865
5866 static void init_delay_drop(struct mlx5_ib_dev *dev)
5867 {
5868         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5869                 return;
5870
5871         mutex_init(&dev->delay_drop.lock);
5872         dev->delay_drop.dev = dev;
5873         dev->delay_drop.activate = false;
5874         dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
5875         INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
5876         atomic_set(&dev->delay_drop.rqs_cnt, 0);
5877         atomic_set(&dev->delay_drop.events_cnt, 0);
5878
5879         delay_drop_debugfs_init(dev);
5880 }
5881
5882 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
5883                                       struct mlx5_ib_multiport_info *mpi)
5884 {
5885         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5886         struct mlx5_ib_port *port = &ibdev->port[port_num];
5887         int comps;
5888         int err;
5889         int i;
5890
5891         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5892
5893         mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
5894
5895         spin_lock(&port->mp.mpi_lock);
5896         if (!mpi->ibdev) {
5897                 spin_unlock(&port->mp.mpi_lock);
5898                 return;
5899         }
5900
5901         mpi->ibdev = NULL;
5902
5903         spin_unlock(&port->mp.mpi_lock);
5904         if (mpi->mdev_events.notifier_call)
5905                 mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
5906         mpi->mdev_events.notifier_call = NULL;
5907         mlx5_remove_netdev_notifier(ibdev, port_num);
5908         spin_lock(&port->mp.mpi_lock);
5909
5910         comps = mpi->mdev_refcnt;
5911         if (comps) {
5912                 mpi->unaffiliate = true;
5913                 init_completion(&mpi->unref_comp);
5914                 spin_unlock(&port->mp.mpi_lock);
5915
5916                 for (i = 0; i < comps; i++)
5917                         wait_for_completion(&mpi->unref_comp);
5918
5919                 spin_lock(&port->mp.mpi_lock);
5920                 mpi->unaffiliate = false;
5921         }
5922
5923         port->mp.mpi = NULL;
5924
5925         list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
5926
5927         spin_unlock(&port->mp.mpi_lock);
5928
5929         err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
5930
5931         mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
5932         /* Log an error, still needed to cleanup the pointers and add
5933          * it back to the list.
5934          */
5935         if (err)
5936                 mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
5937                             port_num + 1);
5938
5939         ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
5940 }
5941
5942 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
5943                                     struct mlx5_ib_multiport_info *mpi)
5944 {
5945         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5946         int err;
5947
5948         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5949
5950         spin_lock(&ibdev->port[port_num].mp.mpi_lock);
5951         if (ibdev->port[port_num].mp.mpi) {
5952                 mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
5953                             port_num + 1);
5954                 spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5955                 return false;
5956         }
5957
5958         ibdev->port[port_num].mp.mpi = mpi;
5959         mpi->ibdev = ibdev;
5960         mpi->mdev_events.notifier_call = NULL;
5961         spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5962
5963         err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
5964         if (err)
5965                 goto unbind;
5966
5967         err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
5968         if (err)
5969                 goto unbind;
5970
5971         err = mlx5_add_netdev_notifier(ibdev, port_num);
5972         if (err) {
5973                 mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
5974                             port_num + 1);
5975                 goto unbind;
5976         }
5977
5978         mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
5979         mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
5980
5981         mlx5_ib_init_cong_debugfs(ibdev, port_num);
5982
5983         return true;
5984
5985 unbind:
5986         mlx5_ib_unbind_slave_port(ibdev, mpi);
5987         return false;
5988 }
5989
5990 static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
5991 {
5992         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
5993         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
5994                                                           port_num + 1);
5995         struct mlx5_ib_multiport_info *mpi;
5996         int err;
5997         int i;
5998
5999         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
6000                 return 0;
6001
6002         err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
6003                                                      &dev->sys_image_guid);
6004         if (err)
6005                 return err;
6006
6007         err = mlx5_nic_vport_enable_roce(dev->mdev);
6008         if (err)
6009                 return err;
6010
6011         mutex_lock(&mlx5_ib_multiport_mutex);
6012         for (i = 0; i < dev->num_ports; i++) {
6013                 bool bound = false;
6014
6015                 /* build a stub multiport info struct for the native port. */
6016                 if (i == port_num) {
6017                         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
6018                         if (!mpi) {
6019                                 mutex_unlock(&mlx5_ib_multiport_mutex);
6020                                 mlx5_nic_vport_disable_roce(dev->mdev);
6021                                 return -ENOMEM;
6022                         }
6023
6024                         mpi->is_master = true;
6025                         mpi->mdev = dev->mdev;
6026                         mpi->sys_image_guid = dev->sys_image_guid;
6027                         dev->port[i].mp.mpi = mpi;
6028                         mpi->ibdev = dev;
6029                         mpi = NULL;
6030                         continue;
6031                 }
6032
6033                 list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
6034                                     list) {
6035                         if (dev->sys_image_guid == mpi->sys_image_guid &&
6036                             (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
6037                                 bound = mlx5_ib_bind_slave_port(dev, mpi);
6038                         }
6039
6040                         if (bound) {
6041                                 dev_dbg(mpi->mdev->device,
6042                                         "removing port from unaffiliated list.\n");
6043                                 mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
6044                                 list_del(&mpi->list);
6045                                 break;
6046                         }
6047                 }
6048                 if (!bound) {
6049                         get_port_caps(dev, i + 1);
6050                         mlx5_ib_dbg(dev, "no free port found for port %d\n",
6051                                     i + 1);
6052                 }
6053         }
6054
6055         list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
6056         mutex_unlock(&mlx5_ib_multiport_mutex);
6057         return err;
6058 }
6059
6060 static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
6061 {
6062         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6063         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
6064                                                           port_num + 1);
6065         int i;
6066
6067         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
6068                 return;
6069
6070         mutex_lock(&mlx5_ib_multiport_mutex);
6071         for (i = 0; i < dev->num_ports; i++) {
6072                 if (dev->port[i].mp.mpi) {
6073                         /* Destroy the native port stub */
6074                         if (i == port_num) {
6075                                 kfree(dev->port[i].mp.mpi);
6076                                 dev->port[i].mp.mpi = NULL;
6077                         } else {
6078                                 mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
6079                                 mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
6080                         }
6081                 }
6082         }
6083
6084         mlx5_ib_dbg(dev, "removing from devlist\n");
6085         list_del(&dev->ib_dev_list);
6086         mutex_unlock(&mlx5_ib_multiport_mutex);
6087
6088         mlx5_nic_vport_disable_roce(dev->mdev);
6089 }
6090
6091 static int var_obj_cleanup(struct ib_uobject *uobject,
6092                            enum rdma_remove_reason why,
6093                            struct uverbs_attr_bundle *attrs)
6094 {
6095         struct mlx5_user_mmap_entry *obj = uobject->object;
6096
6097         rdma_user_mmap_entry_remove(&obj->rdma_entry);
6098         return 0;
6099 }
6100
6101 static struct mlx5_user_mmap_entry *
6102 alloc_var_entry(struct mlx5_ib_ucontext *c)
6103 {
6104         struct mlx5_user_mmap_entry *entry;
6105         struct mlx5_var_table *var_table;
6106         u32 page_idx;
6107         int err;
6108
6109         var_table = &to_mdev(c->ibucontext.device)->var_table;
6110         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
6111         if (!entry)
6112                 return ERR_PTR(-ENOMEM);
6113
6114         mutex_lock(&var_table->bitmap_lock);
6115         page_idx = find_first_zero_bit(var_table->bitmap,
6116                                        var_table->num_var_hw_entries);
6117         if (page_idx >= var_table->num_var_hw_entries) {
6118                 err = -ENOSPC;
6119                 mutex_unlock(&var_table->bitmap_lock);
6120                 goto end;
6121         }
6122
6123         set_bit(page_idx, var_table->bitmap);
6124         mutex_unlock(&var_table->bitmap_lock);
6125
6126         entry->address = var_table->hw_start_addr +
6127                                 (page_idx * var_table->stride_size);
6128         entry->page_idx = page_idx;
6129         entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
6130
6131         err = rdma_user_mmap_entry_insert_range(
6132                 &c->ibucontext, &entry->rdma_entry, var_table->stride_size,
6133                 MLX5_IB_MMAP_OFFSET_START << 16,
6134                 (MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1);
6135         if (err)
6136                 goto err_insert;
6137
6138         return entry;
6139
6140 err_insert:
6141         mutex_lock(&var_table->bitmap_lock);
6142         clear_bit(page_idx, var_table->bitmap);
6143         mutex_unlock(&var_table->bitmap_lock);
6144 end:
6145         kfree(entry);
6146         return ERR_PTR(err);
6147 }
6148
6149 static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
6150         struct uverbs_attr_bundle *attrs)
6151 {
6152         struct ib_uobject *uobj = uverbs_attr_get_uobject(
6153                 attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
6154         struct mlx5_ib_ucontext *c;
6155         struct mlx5_user_mmap_entry *entry;
6156         u64 mmap_offset;
6157         u32 length;
6158         int err;
6159
6160         c = to_mucontext(ib_uverbs_get_ucontext(attrs));
6161         if (IS_ERR(c))
6162                 return PTR_ERR(c);
6163
6164         entry = alloc_var_entry(c);
6165         if (IS_ERR(entry))
6166                 return PTR_ERR(entry);
6167
6168         mmap_offset = mlx5_entry_to_mmap_offset(entry);
6169         length = entry->rdma_entry.npages * PAGE_SIZE;
6170         uobj->object = entry;
6171
6172         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
6173                              &mmap_offset, sizeof(mmap_offset));
6174         if (err)
6175                 goto err;
6176
6177         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
6178                              &entry->page_idx, sizeof(entry->page_idx));
6179         if (err)
6180                 goto err;
6181
6182         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
6183                              &length, sizeof(length));
6184         if (err)
6185                 goto err;
6186
6187         return 0;
6188
6189 err:
6190         rdma_user_mmap_entry_remove(&entry->rdma_entry);
6191         return err;
6192 }
6193
6194 DECLARE_UVERBS_NAMED_METHOD(
6195         MLX5_IB_METHOD_VAR_OBJ_ALLOC,
6196         UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
6197                         MLX5_IB_OBJECT_VAR,
6198                         UVERBS_ACCESS_NEW,
6199                         UA_MANDATORY),
6200         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
6201                            UVERBS_ATTR_TYPE(u32),
6202                            UA_MANDATORY),
6203         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
6204                            UVERBS_ATTR_TYPE(u32),
6205                            UA_MANDATORY),
6206         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
6207                             UVERBS_ATTR_TYPE(u64),
6208                             UA_MANDATORY));
6209
6210 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
6211         MLX5_IB_METHOD_VAR_OBJ_DESTROY,
6212         UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
6213                         MLX5_IB_OBJECT_VAR,
6214                         UVERBS_ACCESS_DESTROY,
6215                         UA_MANDATORY));
6216
6217 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
6218                             UVERBS_TYPE_ALLOC_IDR(var_obj_cleanup),
6219                             &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
6220                             &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
6221
6222 static bool var_is_supported(struct ib_device *device)
6223 {
6224         struct mlx5_ib_dev *dev = to_mdev(device);
6225
6226         return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6227                         MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
6228 }
6229
6230 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6231         mlx5_ib_dm,
6232         UVERBS_OBJECT_DM,
6233         UVERBS_METHOD_DM_ALLOC,
6234         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
6235                             UVERBS_ATTR_TYPE(u64),
6236                             UA_MANDATORY),
6237         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
6238                             UVERBS_ATTR_TYPE(u16),
6239                             UA_OPTIONAL),
6240         UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
6241                              enum mlx5_ib_uapi_dm_type,
6242                              UA_OPTIONAL));
6243
6244 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6245         mlx5_ib_flow_action,
6246         UVERBS_OBJECT_FLOW_ACTION,
6247         UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
6248         UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
6249                              enum mlx5_ib_uapi_flow_action_flags));
6250
6251 static const struct uapi_definition mlx5_ib_defs[] = {
6252         UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
6253         UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
6254
6255         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
6256                                 &mlx5_ib_flow_action),
6257         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
6258         UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
6259                                 UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
6260         {}
6261 };
6262
6263 static int mlx5_ib_read_counters(struct ib_counters *counters,
6264                                  struct ib_counters_read_attr *read_attr,
6265                                  struct uverbs_attr_bundle *attrs)
6266 {
6267         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6268         struct mlx5_read_counters_attr mread_attr = {};
6269         struct mlx5_ib_flow_counters_desc *desc;
6270         int ret, i;
6271
6272         mutex_lock(&mcounters->mcntrs_mutex);
6273         if (mcounters->cntrs_max_index > read_attr->ncounters) {
6274                 ret = -EINVAL;
6275                 goto err_bound;
6276         }
6277
6278         mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
6279                                  GFP_KERNEL);
6280         if (!mread_attr.out) {
6281                 ret = -ENOMEM;
6282                 goto err_bound;
6283         }
6284
6285         mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
6286         mread_attr.flags = read_attr->flags;
6287         ret = mcounters->read_counters(counters->device, &mread_attr);
6288         if (ret)
6289                 goto err_read;
6290
6291         /* do the pass over the counters data array to assign according to the
6292          * descriptions and indexing pairs
6293          */
6294         desc = mcounters->counters_data;
6295         for (i = 0; i < mcounters->ncounters; i++)
6296                 read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
6297
6298 err_read:
6299         kfree(mread_attr.out);
6300 err_bound:
6301         mutex_unlock(&mcounters->mcntrs_mutex);
6302         return ret;
6303 }
6304
6305 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
6306 {
6307         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6308
6309         counters_clear_description(counters);
6310         if (mcounters->hw_cntrs_hndl)
6311                 mlx5_fc_destroy(to_mdev(counters->device)->mdev,
6312                                 mcounters->hw_cntrs_hndl);
6313
6314         kfree(mcounters);
6315
6316         return 0;
6317 }
6318
6319 static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
6320                                                    struct uverbs_attr_bundle *attrs)
6321 {
6322         struct mlx5_ib_mcounters *mcounters;
6323
6324         mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
6325         if (!mcounters)
6326                 return ERR_PTR(-ENOMEM);
6327
6328         mutex_init(&mcounters->mcntrs_mutex);
6329
6330         return &mcounters->ibcntrs;
6331 }
6332
6333 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
6334 {
6335         mlx5_ib_cleanup_multiport_master(dev);
6336         WARN_ON(!xa_empty(&dev->odp_mkeys));
6337         cleanup_srcu_struct(&dev->odp_srcu);
6338
6339         WARN_ON(!xa_empty(&dev->sig_mrs));
6340         WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
6341 }
6342
6343 static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
6344 {
6345         struct mlx5_core_dev *mdev = dev->mdev;
6346         int err;
6347         int i;
6348
6349         for (i = 0; i < dev->num_ports; i++) {
6350                 spin_lock_init(&dev->port[i].mp.mpi_lock);
6351                 rwlock_init(&dev->port[i].roce.netdev_lock);
6352                 dev->port[i].roce.dev = dev;
6353                 dev->port[i].roce.native_port_num = i + 1;
6354                 dev->port[i].roce.last_port_state = IB_PORT_DOWN;
6355         }
6356
6357         mlx5_ib_internal_fill_odp_caps(dev);
6358
6359         err = mlx5_ib_init_multiport_master(dev);
6360         if (err)
6361                 return err;
6362
6363         err = set_has_smi_cap(dev);
6364         if (err)
6365                 return err;
6366
6367         if (!mlx5_core_mp_enabled(mdev)) {
6368                 for (i = 1; i <= dev->num_ports; i++) {
6369                         err = get_port_caps(dev, i);
6370                         if (err)
6371                                 break;
6372                 }
6373         } else {
6374                 err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
6375         }
6376         if (err)
6377                 goto err_mp;
6378
6379         if (mlx5_use_mad_ifc(dev))
6380                 get_ext_port_caps(dev);
6381
6382         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
6383         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
6384         dev->ib_dev.phys_port_cnt       = dev->num_ports;
6385         dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
6386         dev->ib_dev.dev.parent          = mdev->device;
6387
6388         mutex_init(&dev->cap_mask_mutex);
6389         INIT_LIST_HEAD(&dev->qp_list);
6390         spin_lock_init(&dev->reset_flow_resource_lock);
6391         xa_init(&dev->odp_mkeys);
6392         xa_init(&dev->sig_mrs);
6393
6394         spin_lock_init(&dev->dm.lock);
6395         dev->dm.dev = mdev;
6396
6397         err = init_srcu_struct(&dev->odp_srcu);
6398         if (err)
6399                 goto err_mp;
6400
6401         return 0;
6402
6403 err_mp:
6404         mlx5_ib_cleanup_multiport_master(dev);
6405
6406         return -ENOMEM;
6407 }
6408
6409 static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
6410 {
6411         dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
6412
6413         if (!dev->flow_db)
6414                 return -ENOMEM;
6415
6416         mutex_init(&dev->flow_db->lock);
6417
6418         return 0;
6419 }
6420
6421 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
6422 {
6423         kfree(dev->flow_db);
6424 }
6425
6426 static const struct ib_device_ops mlx5_ib_dev_ops = {
6427         .owner = THIS_MODULE,
6428         .driver_id = RDMA_DRIVER_MLX5,
6429         .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
6430
6431         .add_gid = mlx5_ib_add_gid,
6432         .alloc_mr = mlx5_ib_alloc_mr,
6433         .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
6434         .alloc_pd = mlx5_ib_alloc_pd,
6435         .alloc_ucontext = mlx5_ib_alloc_ucontext,
6436         .attach_mcast = mlx5_ib_mcg_attach,
6437         .check_mr_status = mlx5_ib_check_mr_status,
6438         .create_ah = mlx5_ib_create_ah,
6439         .create_counters = mlx5_ib_create_counters,
6440         .create_cq = mlx5_ib_create_cq,
6441         .create_flow = mlx5_ib_create_flow,
6442         .create_qp = mlx5_ib_create_qp,
6443         .create_srq = mlx5_ib_create_srq,
6444         .dealloc_pd = mlx5_ib_dealloc_pd,
6445         .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
6446         .del_gid = mlx5_ib_del_gid,
6447         .dereg_mr = mlx5_ib_dereg_mr,
6448         .destroy_ah = mlx5_ib_destroy_ah,
6449         .destroy_counters = mlx5_ib_destroy_counters,
6450         .destroy_cq = mlx5_ib_destroy_cq,
6451         .destroy_flow = mlx5_ib_destroy_flow,
6452         .destroy_flow_action = mlx5_ib_destroy_flow_action,
6453         .destroy_qp = mlx5_ib_destroy_qp,
6454         .destroy_srq = mlx5_ib_destroy_srq,
6455         .detach_mcast = mlx5_ib_mcg_detach,
6456         .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
6457         .drain_rq = mlx5_ib_drain_rq,
6458         .drain_sq = mlx5_ib_drain_sq,
6459         .enable_driver = mlx5_ib_enable_driver,
6460         .fill_res_entry = mlx5_ib_fill_res_entry,
6461         .fill_stat_entry = mlx5_ib_fill_stat_entry,
6462         .get_dev_fw_str = get_dev_fw_str,
6463         .get_dma_mr = mlx5_ib_get_dma_mr,
6464         .get_link_layer = mlx5_ib_port_link_layer,
6465         .map_mr_sg = mlx5_ib_map_mr_sg,
6466         .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
6467         .mmap = mlx5_ib_mmap,
6468         .mmap_free = mlx5_ib_mmap_free,
6469         .modify_cq = mlx5_ib_modify_cq,
6470         .modify_device = mlx5_ib_modify_device,
6471         .modify_port = mlx5_ib_modify_port,
6472         .modify_qp = mlx5_ib_modify_qp,
6473         .modify_srq = mlx5_ib_modify_srq,
6474         .poll_cq = mlx5_ib_poll_cq,
6475         .post_recv = mlx5_ib_post_recv,
6476         .post_send = mlx5_ib_post_send,
6477         .post_srq_recv = mlx5_ib_post_srq_recv,
6478         .process_mad = mlx5_ib_process_mad,
6479         .query_ah = mlx5_ib_query_ah,
6480         .query_device = mlx5_ib_query_device,
6481         .query_gid = mlx5_ib_query_gid,
6482         .query_pkey = mlx5_ib_query_pkey,
6483         .query_qp = mlx5_ib_query_qp,
6484         .query_srq = mlx5_ib_query_srq,
6485         .read_counters = mlx5_ib_read_counters,
6486         .reg_user_mr = mlx5_ib_reg_user_mr,
6487         .req_notify_cq = mlx5_ib_arm_cq,
6488         .rereg_user_mr = mlx5_ib_rereg_user_mr,
6489         .resize_cq = mlx5_ib_resize_cq,
6490
6491         INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
6492         INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
6493         INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
6494         INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
6495         INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
6496 };
6497
6498 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
6499         .create_flow_action_esp = mlx5_ib_create_flow_action_esp,
6500         .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
6501 };
6502
6503 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
6504         .rdma_netdev_get_params = mlx5_ib_rn_get_params,
6505 };
6506
6507 static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
6508         .get_vf_config = mlx5_ib_get_vf_config,
6509         .get_vf_guid = mlx5_ib_get_vf_guid,
6510         .get_vf_stats = mlx5_ib_get_vf_stats,
6511         .set_vf_guid = mlx5_ib_set_vf_guid,
6512         .set_vf_link_state = mlx5_ib_set_vf_link_state,
6513 };
6514
6515 static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
6516         .alloc_mw = mlx5_ib_alloc_mw,
6517         .dealloc_mw = mlx5_ib_dealloc_mw,
6518 };
6519
6520 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
6521         .alloc_xrcd = mlx5_ib_alloc_xrcd,
6522         .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
6523 };
6524
6525 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
6526         .alloc_dm = mlx5_ib_alloc_dm,
6527         .dealloc_dm = mlx5_ib_dealloc_dm,
6528         .reg_dm_mr = mlx5_ib_reg_dm_mr,
6529 };
6530
6531 static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
6532 {
6533         struct mlx5_core_dev *mdev = dev->mdev;
6534         struct mlx5_var_table *var_table = &dev->var_table;
6535         u8 log_doorbell_bar_size;
6536         u8 log_doorbell_stride;
6537         u64 bar_size;
6538
6539         log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
6540                                         log_doorbell_bar_size);
6541         log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
6542                                         log_doorbell_stride);
6543         var_table->hw_start_addr = dev->mdev->bar_addr +
6544                                 MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
6545                                         doorbell_bar_offset);
6546         bar_size = (1ULL << log_doorbell_bar_size) * 4096;
6547         var_table->stride_size = 1ULL << log_doorbell_stride;
6548         var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size);
6549         mutex_init(&var_table->bitmap_lock);
6550         var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
6551                                           GFP_KERNEL);
6552         return (var_table->bitmap) ? 0 : -ENOMEM;
6553 }
6554
6555 static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
6556 {
6557         bitmap_free(dev->var_table.bitmap);
6558 }
6559
6560 static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
6561 {
6562         struct mlx5_core_dev *mdev = dev->mdev;
6563         int err;
6564
6565         dev->ib_dev.uverbs_cmd_mask     =
6566                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
6567                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
6568                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
6569                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
6570                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
6571                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
6572                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
6573                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
6574                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
6575                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
6576                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
6577                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
6578                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
6579                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
6580                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
6581                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
6582                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
6583                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
6584                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
6585                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
6586                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
6587                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
6588                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
6589                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
6590                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
6591                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
6592         dev->ib_dev.uverbs_ex_cmd_mask =
6593                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
6594                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
6595                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
6596                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)        |
6597                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)        |
6598                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)      |
6599                 (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
6600
6601         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
6602             IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
6603                 ib_set_device_ops(&dev->ib_dev,
6604                                   &mlx5_ib_dev_ipoib_enhanced_ops);
6605
6606         if (mlx5_core_is_pf(mdev))
6607                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
6608
6609         dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
6610
6611         if (MLX5_CAP_GEN(mdev, imaicl)) {
6612                 dev->ib_dev.uverbs_cmd_mask |=
6613                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
6614                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
6615                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
6616         }
6617
6618         if (MLX5_CAP_GEN(mdev, xrc)) {
6619                 dev->ib_dev.uverbs_cmd_mask |=
6620                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
6621                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
6622                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
6623         }
6624
6625         if (MLX5_CAP_DEV_MEM(mdev, memic) ||
6626             MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6627             MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
6628                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
6629
6630         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
6631             MLX5_ACCEL_IPSEC_CAP_DEVICE)
6632                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
6633         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
6634
6635         if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
6636                 dev->ib_dev.driver_def = mlx5_ib_defs;
6637
6638         err = init_node_data(dev);
6639         if (err)
6640                 return err;
6641
6642         if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
6643             (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
6644              MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
6645                 mutex_init(&dev->lb.mutex);
6646
6647         if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6648                         MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
6649                 err = mlx5_ib_init_var_table(dev);
6650                 if (err)
6651                         return err;
6652         }
6653
6654         dev->ib_dev.use_cq_dim = true;
6655
6656         return 0;
6657 }
6658
6659 static const struct ib_device_ops mlx5_ib_dev_port_ops = {
6660         .get_port_immutable = mlx5_port_immutable,
6661         .query_port = mlx5_ib_query_port,
6662 };
6663
6664 static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
6665 {
6666         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
6667         return 0;
6668 }
6669
6670 static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
6671         .get_port_immutable = mlx5_port_rep_immutable,
6672         .query_port = mlx5_ib_rep_query_port,
6673 };
6674
6675 static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
6676 {
6677         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
6678         return 0;
6679 }
6680
6681 static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
6682         .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
6683         .create_wq = mlx5_ib_create_wq,
6684         .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
6685         .destroy_wq = mlx5_ib_destroy_wq,
6686         .get_netdev = mlx5_ib_get_netdev,
6687         .modify_wq = mlx5_ib_modify_wq,
6688 };
6689
6690 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
6691 {
6692         u8 port_num;
6693
6694         dev->ib_dev.uverbs_ex_cmd_mask |=
6695                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
6696                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
6697                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
6698                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
6699                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
6700         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
6701
6702         port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6703
6704         /* Register only for native ports */
6705         return mlx5_add_netdev_notifier(dev, port_num);
6706 }
6707
6708 static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
6709 {
6710         u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6711
6712         mlx5_remove_netdev_notifier(dev, port_num);
6713 }
6714
6715 static int mlx5_ib_stage_raw_eth_roce_init(struct mlx5_ib_dev *dev)
6716 {
6717         struct mlx5_core_dev *mdev = dev->mdev;
6718         enum rdma_link_layer ll;
6719         int port_type_cap;
6720         int err = 0;
6721
6722         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6723         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6724
6725         if (ll == IB_LINK_LAYER_ETHERNET)
6726                 err = mlx5_ib_stage_common_roce_init(dev);
6727
6728         return err;
6729 }
6730
6731 static void mlx5_ib_stage_raw_eth_roce_cleanup(struct mlx5_ib_dev *dev)
6732 {
6733         mlx5_ib_stage_common_roce_cleanup(dev);
6734 }
6735
6736 static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
6737 {
6738         struct mlx5_core_dev *mdev = dev->mdev;
6739         enum rdma_link_layer ll;
6740         int port_type_cap;
6741         int err;
6742
6743         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6744         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6745
6746         if (ll == IB_LINK_LAYER_ETHERNET) {
6747                 err = mlx5_ib_stage_common_roce_init(dev);
6748                 if (err)
6749                         return err;
6750
6751                 err = mlx5_enable_eth(dev);
6752                 if (err)
6753                         goto cleanup;
6754         }
6755
6756         return 0;
6757 cleanup:
6758         mlx5_ib_stage_common_roce_cleanup(dev);
6759
6760         return err;
6761 }
6762
6763 static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
6764 {
6765         struct mlx5_core_dev *mdev = dev->mdev;
6766         enum rdma_link_layer ll;
6767         int port_type_cap;
6768
6769         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6770         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6771
6772         if (ll == IB_LINK_LAYER_ETHERNET) {
6773                 mlx5_disable_eth(dev);
6774                 mlx5_ib_stage_common_roce_cleanup(dev);
6775         }
6776 }
6777
6778 static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
6779 {
6780         return create_dev_resources(&dev->devr);
6781 }
6782
6783 static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
6784 {
6785         destroy_dev_resources(&dev->devr);
6786 }
6787
6788 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
6789 {
6790         return mlx5_ib_odp_init_one(dev);
6791 }
6792
6793 static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
6794 {
6795         mlx5_ib_odp_cleanup_one(dev);
6796 }
6797
6798 static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
6799         .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
6800         .get_hw_stats = mlx5_ib_get_hw_stats,
6801         .counter_bind_qp = mlx5_ib_counter_bind_qp,
6802         .counter_unbind_qp = mlx5_ib_counter_unbind_qp,
6803         .counter_dealloc = mlx5_ib_counter_dealloc,
6804         .counter_alloc_stats = mlx5_ib_counter_alloc_stats,
6805         .counter_update_stats = mlx5_ib_counter_update_stats,
6806 };
6807
6808 static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
6809 {
6810         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
6811                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
6812
6813                 return mlx5_ib_alloc_counters(dev);
6814         }
6815
6816         return 0;
6817 }
6818
6819 static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
6820 {
6821         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
6822                 mlx5_ib_dealloc_counters(dev);
6823 }
6824
6825 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
6826 {
6827         mlx5_ib_init_cong_debugfs(dev,
6828                                   mlx5_core_native_port_num(dev->mdev) - 1);
6829         return 0;
6830 }
6831
6832 static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
6833 {
6834         mlx5_ib_cleanup_cong_debugfs(dev,
6835                                      mlx5_core_native_port_num(dev->mdev) - 1);
6836 }
6837
6838 static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
6839 {
6840         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
6841         return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
6842 }
6843
6844 static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
6845 {
6846         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
6847 }
6848
6849 static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
6850 {
6851         int err;
6852
6853         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
6854         if (err)
6855                 return err;
6856
6857         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
6858         if (err)
6859                 mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
6860
6861         return err;
6862 }
6863
6864 static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
6865 {
6866         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
6867         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
6868 }
6869
6870 static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
6871 {
6872         const char *name;
6873
6874         rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
6875         if (!mlx5_lag_is_roce(dev->mdev))
6876                 name = "mlx5_%d";
6877         else
6878                 name = "mlx5_bond_%d";
6879         return ib_register_device(&dev->ib_dev, name);
6880 }
6881
6882 static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
6883 {
6884         destroy_umrc_res(dev);
6885 }
6886
6887 static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
6888 {
6889         ib_unregister_device(&dev->ib_dev);
6890 }
6891
6892 static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
6893 {
6894         return create_umr_res(dev);
6895 }
6896
6897 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
6898 {
6899         init_delay_drop(dev);
6900
6901         return 0;
6902 }
6903
6904 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
6905 {
6906         cancel_delay_drop(dev);
6907 }
6908
6909 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
6910 {
6911         dev->mdev_events.notifier_call = mlx5_ib_event;
6912         mlx5_notifier_register(dev->mdev, &dev->mdev_events);
6913         return 0;
6914 }
6915
6916 static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
6917 {
6918         mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
6919 }
6920
6921 static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
6922 {
6923         int uid;
6924
6925         uid = mlx5_ib_devx_create(dev, false);
6926         if (uid > 0) {
6927                 dev->devx_whitelist_uid = uid;
6928                 mlx5_ib_devx_init_event_table(dev);
6929         }
6930
6931         return 0;
6932 }
6933 static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
6934 {
6935         if (dev->devx_whitelist_uid) {
6936                 mlx5_ib_devx_cleanup_event_table(dev);
6937                 mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
6938         }
6939 }
6940
6941 int mlx5_ib_enable_driver(struct ib_device *dev)
6942 {
6943         struct mlx5_ib_dev *mdev = to_mdev(dev);
6944         int ret;
6945
6946         ret = mlx5_ib_test_wc(mdev);
6947         mlx5_ib_dbg(mdev, "Write-Combining %s",
6948                     mdev->wc_support ? "supported" : "not supported");
6949
6950         return ret;
6951 }
6952
6953 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
6954                       const struct mlx5_ib_profile *profile,
6955                       int stage)
6956 {
6957         dev->ib_active = false;
6958
6959         /* Number of stages to cleanup */
6960         while (stage) {
6961                 stage--;
6962                 if (profile->stage[stage].cleanup)
6963                         profile->stage[stage].cleanup(dev);
6964         }
6965
6966         kfree(dev->port);
6967         ib_dealloc_device(&dev->ib_dev);
6968 }
6969
6970 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
6971                     const struct mlx5_ib_profile *profile)
6972 {
6973         int err;
6974         int i;
6975
6976         for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
6977                 if (profile->stage[i].init) {
6978                         err = profile->stage[i].init(dev);
6979                         if (err)
6980                                 goto err_out;
6981                 }
6982         }
6983
6984         dev->profile = profile;
6985         dev->ib_active = true;
6986
6987         return dev;
6988
6989 err_out:
6990         __mlx5_ib_remove(dev, profile, i);
6991
6992         return NULL;
6993 }
6994
6995 static const struct mlx5_ib_profile pf_profile = {
6996         STAGE_CREATE(MLX5_IB_STAGE_INIT,
6997                      mlx5_ib_stage_init_init,
6998                      mlx5_ib_stage_init_cleanup),
6999         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
7000                      mlx5_ib_stage_flow_db_init,
7001                      mlx5_ib_stage_flow_db_cleanup),
7002         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
7003                      mlx5_ib_stage_caps_init,
7004                      mlx5_ib_stage_caps_cleanup),
7005         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
7006                      mlx5_ib_stage_non_default_cb,
7007                      NULL),
7008         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
7009                      mlx5_ib_stage_roce_init,
7010                      mlx5_ib_stage_roce_cleanup),
7011         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
7012                      mlx5_init_srq_table,
7013                      mlx5_cleanup_srq_table),
7014         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
7015                      mlx5_ib_stage_dev_res_init,
7016                      mlx5_ib_stage_dev_res_cleanup),
7017         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
7018                      mlx5_ib_stage_dev_notifier_init,
7019                      mlx5_ib_stage_dev_notifier_cleanup),
7020         STAGE_CREATE(MLX5_IB_STAGE_ODP,
7021                      mlx5_ib_stage_odp_init,
7022                      mlx5_ib_stage_odp_cleanup),
7023         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
7024                      mlx5_ib_stage_counters_init,
7025                      mlx5_ib_stage_counters_cleanup),
7026         STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
7027                      mlx5_ib_stage_cong_debugfs_init,
7028                      mlx5_ib_stage_cong_debugfs_cleanup),
7029         STAGE_CREATE(MLX5_IB_STAGE_UAR,
7030                      mlx5_ib_stage_uar_init,
7031                      mlx5_ib_stage_uar_cleanup),
7032         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
7033                      mlx5_ib_stage_bfrag_init,
7034                      mlx5_ib_stage_bfrag_cleanup),
7035         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
7036                      NULL,
7037                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
7038         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
7039                      mlx5_ib_stage_devx_init,
7040                      mlx5_ib_stage_devx_cleanup),
7041         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
7042                      mlx5_ib_stage_ib_reg_init,
7043                      mlx5_ib_stage_ib_reg_cleanup),
7044         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
7045                      mlx5_ib_stage_post_ib_reg_umr_init,
7046                      NULL),
7047         STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
7048                      mlx5_ib_stage_delay_drop_init,
7049                      mlx5_ib_stage_delay_drop_cleanup),
7050 };
7051
7052 const struct mlx5_ib_profile raw_eth_profile = {
7053         STAGE_CREATE(MLX5_IB_STAGE_INIT,
7054                      mlx5_ib_stage_init_init,
7055                      mlx5_ib_stage_init_cleanup),
7056         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
7057                      mlx5_ib_stage_flow_db_init,
7058                      mlx5_ib_stage_flow_db_cleanup),
7059         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
7060                      mlx5_ib_stage_caps_init,
7061                      mlx5_ib_stage_caps_cleanup),
7062         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
7063                      mlx5_ib_stage_raw_eth_non_default_cb,
7064                      NULL),
7065         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
7066                      mlx5_ib_stage_raw_eth_roce_init,
7067                      mlx5_ib_stage_raw_eth_roce_cleanup),
7068         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
7069                      mlx5_init_srq_table,
7070                      mlx5_cleanup_srq_table),
7071         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
7072                      mlx5_ib_stage_dev_res_init,
7073                      mlx5_ib_stage_dev_res_cleanup),
7074         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
7075                      mlx5_ib_stage_dev_notifier_init,
7076                      mlx5_ib_stage_dev_notifier_cleanup),
7077         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
7078                      mlx5_ib_stage_counters_init,
7079                      mlx5_ib_stage_counters_cleanup),
7080         STAGE_CREATE(MLX5_IB_STAGE_UAR,
7081                      mlx5_ib_stage_uar_init,
7082                      mlx5_ib_stage_uar_cleanup),
7083         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
7084                      mlx5_ib_stage_bfrag_init,
7085                      mlx5_ib_stage_bfrag_cleanup),
7086         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
7087                      NULL,
7088                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
7089         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
7090                      mlx5_ib_stage_devx_init,
7091                      mlx5_ib_stage_devx_cleanup),
7092         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
7093                      mlx5_ib_stage_ib_reg_init,
7094                      mlx5_ib_stage_ib_reg_cleanup),
7095         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
7096                      mlx5_ib_stage_post_ib_reg_umr_init,
7097                      NULL),
7098 };
7099
7100 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
7101 {
7102         struct mlx5_ib_multiport_info *mpi;
7103         struct mlx5_ib_dev *dev;
7104         bool bound = false;
7105         int err;
7106
7107         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
7108         if (!mpi)
7109                 return NULL;
7110
7111         mpi->mdev = mdev;
7112
7113         err = mlx5_query_nic_vport_system_image_guid(mdev,
7114                                                      &mpi->sys_image_guid);
7115         if (err) {
7116                 kfree(mpi);
7117                 return NULL;
7118         }
7119
7120         mutex_lock(&mlx5_ib_multiport_mutex);
7121         list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
7122                 if (dev->sys_image_guid == mpi->sys_image_guid)
7123                         bound = mlx5_ib_bind_slave_port(dev, mpi);
7124
7125                 if (bound) {
7126                         rdma_roce_rescan_device(&dev->ib_dev);
7127                         break;
7128                 }
7129         }
7130
7131         if (!bound) {
7132                 list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
7133                 dev_dbg(mdev->device,
7134                         "no suitable IB device found to bind to, added to unaffiliated list.\n");
7135         }
7136         mutex_unlock(&mlx5_ib_multiport_mutex);
7137
7138         return mpi;
7139 }
7140
7141 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
7142 {
7143         const struct mlx5_ib_profile *profile;
7144         enum rdma_link_layer ll;
7145         struct mlx5_ib_dev *dev;
7146         int port_type_cap;
7147         int num_ports;
7148
7149         printk_once(KERN_INFO "%s", mlx5_version);
7150
7151         if (MLX5_ESWITCH_MANAGER(mdev) &&
7152             mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
7153                 if (!mlx5_core_mp_enabled(mdev))
7154                         mlx5_ib_register_vport_reps(mdev);
7155                 return mdev;
7156         }
7157
7158         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
7159         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
7160
7161         if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
7162                 return mlx5_ib_add_slave_port(mdev);
7163
7164         num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
7165                         MLX5_CAP_GEN(mdev, num_vhca_ports));
7166         dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
7167         if (!dev)
7168                 return NULL;
7169         dev->port = kcalloc(num_ports, sizeof(*dev->port),
7170                              GFP_KERNEL);
7171         if (!dev->port) {
7172                 ib_dealloc_device(&dev->ib_dev);
7173                 return NULL;
7174         }
7175
7176         dev->mdev = mdev;
7177         dev->num_ports = num_ports;
7178
7179         if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
7180                 profile = &raw_eth_profile;
7181         else
7182                 profile = &pf_profile;
7183
7184         return __mlx5_ib_add(dev, profile);
7185 }
7186
7187 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
7188 {
7189         struct mlx5_ib_multiport_info *mpi;
7190         struct mlx5_ib_dev *dev;
7191
7192         if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
7193                 mlx5_ib_unregister_vport_reps(mdev);
7194                 return;
7195         }
7196
7197         if (mlx5_core_is_mp_slave(mdev)) {
7198                 mpi = context;
7199                 mutex_lock(&mlx5_ib_multiport_mutex);
7200                 if (mpi->ibdev)
7201                         mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
7202                 list_del(&mpi->list);
7203                 mutex_unlock(&mlx5_ib_multiport_mutex);
7204                 kfree(mpi);
7205                 return;
7206         }
7207
7208         dev = context;
7209         __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
7210 }
7211
7212 static struct mlx5_interface mlx5_ib_interface = {
7213         .add            = mlx5_ib_add,
7214         .remove         = mlx5_ib_remove,
7215         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
7216 };
7217
7218 unsigned long mlx5_ib_get_xlt_emergency_page(void)
7219 {
7220         mutex_lock(&xlt_emergency_page_mutex);
7221         return xlt_emergency_page;
7222 }
7223
7224 void mlx5_ib_put_xlt_emergency_page(void)
7225 {
7226         mutex_unlock(&xlt_emergency_page_mutex);
7227 }
7228
7229 static int __init mlx5_ib_init(void)
7230 {
7231         int err;
7232
7233         xlt_emergency_page = __get_free_page(GFP_KERNEL);
7234         if (!xlt_emergency_page)
7235                 return -ENOMEM;
7236
7237         mutex_init(&xlt_emergency_page_mutex);
7238
7239         mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
7240         if (!mlx5_ib_event_wq) {
7241                 free_page(xlt_emergency_page);
7242                 return -ENOMEM;
7243         }
7244
7245         mlx5_ib_odp_init();
7246
7247         err = mlx5_register_interface(&mlx5_ib_interface);
7248
7249         return err;
7250 }
7251
7252 static void __exit mlx5_ib_cleanup(void)
7253 {
7254         mlx5_unregister_interface(&mlx5_ib_interface);
7255         destroy_workqueue(mlx5_ib_event_wq);
7256         mutex_destroy(&xlt_emergency_page_mutex);
7257         free_page(xlt_emergency_page);
7258 }
7259
7260 module_init(mlx5_ib_init);
7261 module_exit(mlx5_ib_cleanup);