kernel/bpf/devmap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
   3  */
   4
   5 /* Devmap's primary use is as a backend map for XDP BPF helper call
   6  * bpf_redirect_map(). Because XDP is mostly concerned with performance we
   7  * spent some effort to ensure the datapath with redirect maps does not use
   8  * any locking. This is a quick note on the details.
   9  *
  10  * We have three possible paths to get into the devmap control plane: bpf
  11  * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
  12  * will invoke an update, delete, or lookup operation. To ensure updates and
  13  * deletes appear atomic from the datapath side xchg() is used to modify the
  14  * netdev_map array. Then because the datapath does a lookup into the netdev_map
  15  * array (read-only) from an RCU critical section we use call_rcu() to wait for
  16  * an rcu grace period before free'ing the old data structures. This ensures the
  17  * datapath always has a valid copy. However, the datapath does a "flush"
  18  * operation that pushes any pending packets in the driver outside the RCU
  19  * critical section. Each bpf_dtab_netdev tracks these pending operations using
  20  * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
  21  * this list is empty, indicating outstanding flush operations have completed.
  22  *
  23  * BPF syscalls may race with BPF program calls on any of the update, delete
  24  * or lookup operations. As noted above the xchg() operation also keeps the
  25  * netdev_map consistent in this case. From the devmap side BPF programs
  26  * calling into these operations are the same as multiple user space threads
  27  * making system calls.
  28  *
  29  * Finally, any of the above may race with a netdev_unregister notifier. The
  30  * unregister notifier must search for net devices in the map structure that
  31  * contain a reference to the net device and remove them. This is a two step
  32  * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
  33  * check to see if the ifindex is the same as the net_device being removed.
  34  * When removing the dev a cmpxchg() is used to ensure the correct dev is
  35  * removed, in the case of a concurrent update or delete operation it is
  36  * possible that the initially referenced dev is no longer in the map. As the
  37  * notifier hook walks the map we know that new dev references can not be
  38  * added by the user because core infrastructure ensures dev_get_by_index()
  39  * calls will fail at this point.
  40  *
  41  * The devmap_hash type is a map type which interprets keys as ifindexes and
  42  * indexes these using a hashmap. This allows maps that use ifindex as key to be
  43  * densely packed instead of having holes in the lookup array for unused
  44  * ifindexes. The setup and packet enqueue/send code is shared between the two
  45  * types of devmap; only the lookup and insertion is different.
  46  */
  47 #include <linux/bpf.h>
  48 #include <net/xdp.h>
  49 #include <linux/filter.h>
  50 #include <trace/events/xdp.h>
  51
/* map_flags bits user space may set when creating a devmap; any other bit
 * makes dev_map_init_map() fail with -EINVAL.
 */
#define DEV_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

/* Capacity of one bulk queue: bq_enqueue() transmits the queued frames before
 * adding another one once this many are pending.
 */
#define DEV_MAP_BULK_SIZE 16
/* Per-CPU queue of XDP frames waiting to be transmitted on one net_device. */
struct xdp_dev_bulk_queue {
        struct xdp_frame *q[DEV_MAP_BULK_SIZE]; /* frames pending transmit */
        struct list_head flush_node; /* link in this CPU's dev_flush_list */
        struct net_device *dev; /* egress device, set at NETDEV_REGISTER */
        struct net_device *dev_rx; /* ingress device of the queued frames */
        unsigned int count; /* number of valid entries in q[] */
};
  63
/* One map entry: a net_device the map holds a reference on, plus what is
 * needed to unlink the entry and free it through call_rcu().
 */
struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
        struct hlist_node index_hlist; /* bucket link, used by DEVMAP_HASH */
        struct bpf_dtab *dtab; /* map this entry was allocated for */
        struct rcu_head rcu; /* handed to call_rcu(__dev_map_entry_free) */
        unsigned int idx; /* key the entry was inserted under */
};
  71
/* A devmap instance; backs both BPF_MAP_TYPE_DEVMAP and
 * BPF_MAP_TYPE_DEVMAP_HASH.
 */
struct bpf_dtab {
        struct bpf_map map; /* generic map header, container_of() anchor */
        struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
        struct list_head list; /* link in dev_map_list */

        /* these are only used for DEVMAP_HASH type maps */
        struct hlist_head *dev_index_head; /* bucket array, n_buckets long */
        spinlock_t index_lock; /* taken around hash insert and delete */
        unsigned int items; /* entries currently linked into the hash */
        u32 n_buckets; /* power of two, so (n_buckets - 1) works as a mask */
};
  83
/* Per-CPU list of bulk queues holding frames that __dev_flush() must send. */
static DEFINE_PER_CPU(struct list_head, dev_flush_list);
/* Taken when adding a map to, or removing it from, dev_map_list. */
static DEFINE_SPINLOCK(dev_map_lock);
/* Every live devmap; walked under RCU by the NETDEV_UNREGISTER handler. */
static LIST_HEAD(dev_map_list);
  87
  88 static struct hlist_head *dev_map_create_hash(unsigned int entries)
  89 {
  90         int i;
  91         struct hlist_head *hash;
  92
  93         hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
  94         if (hash != NULL)
  95                 for (i = 0; i < entries; i++)
  96                         INIT_HLIST_HEAD(&hash[i]);
  97
  98         return hash;
  99 }
 100
 101 static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
 102                                                     int idx)
 103 {
 104         return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
 105 }
 106
 107 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 108 {
 109         u64 cost = 0;
 110         int err;
 111
 112         /* check sanity of attributes */
 113         if (attr->max_entries == 0 || attr->key_size != 4 ||
 114             attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 115                 return -EINVAL;
 116
 117         /* Lookup returns a pointer straight to dev->ifindex, so make sure the
 118          * verifier prevents writes from the BPF side
 119          */
 120         attr->map_flags |= BPF_F_RDONLY_PROG;
 121
 122
 123         bpf_map_init_from_attr(&dtab->map, attr);
 124
 125         if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 126                 dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
 127
 128                 if (!dtab->n_buckets) /* Overflow check */
 129                         return -EINVAL;
 130                 cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
 131         } else {
 132                 cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
 133         }
 134
 135         /* if map size is larger than memlock limit, reject it */
 136         err = bpf_map_charge_init(&dtab->map.memory, cost);
 137         if (err)
 138                 return -EINVAL;
 139
 140         if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 141                 dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
 142                 if (!dtab->dev_index_head)
 143                         goto free_charge;
 144
 145                 spin_lock_init(&dtab->index_lock);
 146         } else {
 147                 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 148                                                       sizeof(struct bpf_dtab_netdev *),
 149                                                       dtab->map.numa_node);
 150                 if (!dtab->netdev_map)
 151                         goto free_charge;
 152         }
 153
 154         return 0;
 155
 156 free_charge:
 157         bpf_map_charge_finish(&dtab->map.memory);
 158         return -ENOMEM;
 159 }
 160
 161 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 162 {
 163         struct bpf_dtab *dtab;
 164         int err;
 165
 166         if (!capable(CAP_NET_ADMIN))
 167                 return ERR_PTR(-EPERM);
 168
 169         dtab = kzalloc(sizeof(*dtab), GFP_USER);
 170         if (!dtab)
 171                 return ERR_PTR(-ENOMEM);
 172
 173         err = dev_map_init_map(dtab, attr);
 174         if (err) {
 175                 kfree(dtab);
 176                 return ERR_PTR(err);
 177         }
 178
 179         spin_lock(&dev_map_lock);
 180         list_add_tail_rcu(&dtab->list, &dev_map_list);
 181         spin_unlock(&dev_map_lock);
 182
 183         return &dtab->map;
 184 }
 185
 186 static void dev_map_free(struct bpf_map *map)
 187 {
 188         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 189         int i;
 190
 191         /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 192          * so the programs (can be more than one that used this map) were
 193          * disconnected from events. The following synchronize_rcu() guarantees
 194          * both rcu read critical sections complete and waits for
 195          * preempt-disable regions (NAPI being the relevant context here) so we
 196          * are certain there will be no further reads against the netdev_map and
 197          * all flush operations are complete. Flush operations can only be done
 198          * from NAPI context for this reason.
 199          */
 200
 201         spin_lock(&dev_map_lock);
 202         list_del_rcu(&dtab->list);
 203         spin_unlock(&dev_map_lock);
 204
 205         bpf_clear_redirect_map(map);
 206         synchronize_rcu();
 207
 208         /* Make sure prior __dev_map_entry_free() have completed. */
 209         rcu_barrier();
 210
 211         if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 212                 for (i = 0; i < dtab->n_buckets; i++) {
 213                         struct bpf_dtab_netdev *dev;
 214                         struct hlist_head *head;
 215                         struct hlist_node *next;
 216
 217                         head = dev_map_index_hash(dtab, i);
 218
 219                         hlist_for_each_entry_safe(dev, next, head, index_hlist) {
 220                                 hlist_del_rcu(&dev->index_hlist);
 221                                 dev_put(dev->dev);
 222                                 kfree(dev);
 223                         }
 224                 }
 225
 226                 kfree(dtab->dev_index_head);
 227         } else {
 228                 for (i = 0; i < dtab->map.max_entries; i++) {
 229                         struct bpf_dtab_netdev *dev;
 230
 231                         dev = dtab->netdev_map[i];
 232                         if (!dev)
 233                                 continue;
 234
 235                         dev_put(dev->dev);
 236                         kfree(dev);
 237                 }
 238
 239                 bpf_map_area_free(dtab->netdev_map);
 240         }
 241
 242         kfree(dtab);
 243 }
 244
 245 static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 246 {
 247         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 248         u32 index = key ? *(u32 *)key : U32_MAX;
 249         u32 *next = next_key;
 250
 251         if (index >= dtab->map.max_entries) {
 252                 *next = 0;
 253                 return 0;
 254         }
 255
 256         if (index == dtab->map.max_entries - 1)
 257                 return -ENOENT;
 258         *next = index + 1;
 259         return 0;
 260 }
 261
 262 struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
 263 {
 264         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 265         struct hlist_head *head = dev_map_index_hash(dtab, key);
 266         struct bpf_dtab_netdev *dev;
 267
 268         hlist_for_each_entry_rcu(dev, head, index_hlist,
 269                                  lockdep_is_held(&dtab->index_lock))
 270                 if (dev->idx == key)
 271                         return dev;
 272
 273         return NULL;
 274 }
 275
 276 static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
 277                                     void *next_key)
 278 {
 279         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 280         u32 idx, *next = next_key;
 281         struct bpf_dtab_netdev *dev, *next_dev;
 282         struct hlist_head *head;
 283         int i = 0;
 284
 285         if (!key)
 286                 goto find_first;
 287
 288         idx = *(u32 *)key;
 289
 290         dev = __dev_map_hash_lookup_elem(map, idx);
 291         if (!dev)
 292                 goto find_first;
 293
 294         next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
 295                                     struct bpf_dtab_netdev, index_hlist);
 296
 297         if (next_dev) {
 298                 *next = next_dev->idx;
 299                 return 0;
 300         }
 301
 302         i = idx & (dtab->n_buckets - 1);
 303         i++;
 304
 305  find_first:
 306         for (; i < dtab->n_buckets; i++) {
 307                 head = dev_map_index_hash(dtab, i);
 308
 309                 next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
 310                                             struct bpf_dtab_netdev,
 311                                             index_hlist);
 312                 if (next_dev) {
 313                         *next = next_dev->idx;
 314                         return 0;
 315                 }
 316         }
 317
 318         return -ENOENT;
 319 }
 320
 321 static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 322 {
 323         struct net_device *dev = bq->dev;
 324         int sent = 0, drops = 0, err = 0;
 325         int i;
 326
 327         if (unlikely(!bq->count))
 328                 return 0;
 329
 330         for (i = 0; i < bq->count; i++) {
 331                 struct xdp_frame *xdpf = bq->q[i];
 332
 333                 prefetch(xdpf);
 334         }
 335
 336         sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
 337         if (sent < 0) {
 338                 err = sent;
 339                 sent = 0;
 340                 goto error;
 341         }
 342         drops = bq->count - sent;
 343 out:
 344         bq->count = 0;
 345
 346         trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
 347         bq->dev_rx = NULL;
 348         __list_del_clearprev(&bq->flush_node);
 349         return 0;
 350 error:
 351         /* If ndo_xdp_xmit fails with an errno, no frames have been
 352          * xmit'ed and it's our responsibility to them free all.
 353          */
 354         for (i = 0; i < bq->count; i++) {
 355                 struct xdp_frame *xdpf = bq->q[i];
 356
 357                 xdp_return_frame_rx_napi(xdpf);
 358                 drops++;
 359         }
 360         goto out;
 361 }
 362
 363 /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
 364  * from the driver before returning from its napi->poll() routine. The poll()
 365  * routine is called either from busy_poll context or net_rx_action signaled
 366  * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 367  * net device can be torn down. On devmap tear down we ensure the flush list
 368  * is empty before completing to ensure all flush operations have completed.
 369  * When drivers update the bpf program they may need to ensure any flush ops
 370  * are also complete. Using synchronize_rcu or call_rcu will suffice for this
 371  * because both wait for napi context to exit.
 372  */
 373 void __dev_flush(void)
 374 {
 375         struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 376         struct xdp_dev_bulk_queue *bq, *tmp;
 377
 378         list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
 379                 bq_xmit_all(bq, XDP_XMIT_FLUSH);
 380 }
 381
 382 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
 383  * update happens in parallel here a dev_put wont happen until after reading the
 384  * ifindex.
 385  */
 386 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 387 {
 388         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 389         struct bpf_dtab_netdev *obj;
 390
 391         if (key >= map->max_entries)
 392                 return NULL;
 393
 394         obj = READ_ONCE(dtab->netdev_map[key]);
 395         return obj;
 396 }
 397
 398 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
 399  * Thus, safe percpu variable access.
 400  */
 401 static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 402                       struct net_device *dev_rx)
 403 {
 404         struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 405         struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
 406
 407         if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 408                 bq_xmit_all(bq, 0);
 409
 410         /* Ingress dev_rx will be the same for all xdp_frame's in
 411          * bulk_queue, because bq stored per-CPU and must be flushed
 412          * from net_device drivers NAPI func end.
 413          */
 414         if (!bq->dev_rx)
 415                 bq->dev_rx = dev_rx;
 416
 417         bq->q[bq->count++] = xdpf;
 418
 419         if (!bq->flush_node.prev)
 420                 list_add(&bq->flush_node, flush_list);
 421
 422         return 0;
 423 }
 424
 425 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 426                                struct net_device *dev_rx)
 427 {
 428         struct xdp_frame *xdpf;
 429         int err;
 430
 431         if (!dev->netdev_ops->ndo_xdp_xmit)
 432                 return -EOPNOTSUPP;
 433
 434         err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
 435         if (unlikely(err))
 436                 return err;
 437
 438         xdpf = convert_to_xdp_frame(xdp);
 439         if (unlikely(!xdpf))
 440                 return -EOVERFLOW;
 441
 442         return bq_enqueue(dev, xdpf, dev_rx);
 443 }
 444
/* Queue @xdp for transmission on @dev directly, without going through a map
 * entry. Returns whatever __xdp_enqueue() returns.
 */
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
{
        return __xdp_enqueue(dev, xdp, dev_rx);
}
 450
 451 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 452                     struct net_device *dev_rx)
 453 {
 454         struct net_device *dev = dst->dev;
 455
 456         return __xdp_enqueue(dev, xdp, dev_rx);
 457 }
 458
 459 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 460                              struct bpf_prog *xdp_prog)
 461 {
 462         int err;
 463
 464         err = xdp_ok_fwd_dev(dst->dev, skb->len);
 465         if (unlikely(err))
 466                 return err;
 467         skb->dev = dst->dev;
 468         generic_xdp_tx(skb, xdp_prog);
 469
 470         return 0;
 471 }
 472
 473 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 474 {
 475         struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
 476         struct net_device *dev = obj ? obj->dev : NULL;
 477
 478         return dev ? &dev->ifindex : NULL;
 479 }
 480
 481 static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
 482 {
 483         struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
 484                                                                 *(u32 *)key);
 485         struct net_device *dev = obj ? obj->dev : NULL;
 486
 487         return dev ? &dev->ifindex : NULL;
 488 }
 489
 490 static void __dev_map_entry_free(struct rcu_head *rcu)
 491 {
 492         struct bpf_dtab_netdev *dev;
 493
 494         dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 495         dev_put(dev->dev);
 496         kfree(dev);
 497 }
 498
 499 static int dev_map_delete_elem(struct bpf_map *map, void *key)
 500 {
 501         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 502         struct bpf_dtab_netdev *old_dev;
 503         int k = *(u32 *)key;
 504
 505         if (k >= map->max_entries)
 506                 return -EINVAL;
 507
 508         /* Use call_rcu() here to ensure any rcu critical sections have
 509          * completed as well as any flush operations because call_rcu
 510          * will wait for preempt-disable region to complete, NAPI in this
 511          * context.  And additionally, the driver tear down ensures all
 512          * soft irqs are complete before removing the net device in the
 513          * case of dev_put equals zero.
 514          */
 515         old_dev = xchg(&dtab->netdev_map[k], NULL);
 516         if (old_dev)
 517                 call_rcu(&old_dev->rcu, __dev_map_entry_free);
 518         return 0;
 519 }
 520
 521 static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 522 {
 523         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 524         struct bpf_dtab_netdev *old_dev;
 525         int k = *(u32 *)key;
 526         unsigned long flags;
 527         int ret = -ENOENT;
 528
 529         spin_lock_irqsave(&dtab->index_lock, flags);
 530
 531         old_dev = __dev_map_hash_lookup_elem(map, k);
 532         if (old_dev) {
 533                 dtab->items--;
 534                 hlist_del_init_rcu(&old_dev->index_hlist);
 535                 call_rcu(&old_dev->rcu, __dev_map_entry_free);
 536                 ret = 0;
 537         }
 538         spin_unlock_irqrestore(&dtab->index_lock, flags);
 539
 540         return ret;
 541 }
 542
 543 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 544                                                     struct bpf_dtab *dtab,
 545                                                     u32 ifindex,
 546                                                     unsigned int idx)
 547 {
 548         struct bpf_dtab_netdev *dev;
 549
 550         dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
 551                            dtab->map.numa_node);
 552         if (!dev)
 553                 return ERR_PTR(-ENOMEM);
 554
 555         dev->dev = dev_get_by_index(net, ifindex);
 556         if (!dev->dev) {
 557                 kfree(dev);
 558                 return ERR_PTR(-EINVAL);
 559         }
 560
 561         dev->idx = idx;
 562         dev->dtab = dtab;
 563
 564         return dev;
 565 }
 566
 567 static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 568                                  void *key, void *value, u64 map_flags)
 569 {
 570         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 571         struct bpf_dtab_netdev *dev, *old_dev;
 572         u32 ifindex = *(u32 *)value;
 573         u32 i = *(u32 *)key;
 574
 575         if (unlikely(map_flags > BPF_EXIST))
 576                 return -EINVAL;
 577         if (unlikely(i >= dtab->map.max_entries))
 578                 return -E2BIG;
 579         if (unlikely(map_flags == BPF_NOEXIST))
 580                 return -EEXIST;
 581
 582         if (!ifindex) {
 583                 dev = NULL;
 584         } else {
 585                 dev = __dev_map_alloc_node(net, dtab, ifindex, i);
 586                 if (IS_ERR(dev))
 587                         return PTR_ERR(dev);
 588         }
 589
 590         /* Use call_rcu() here to ensure rcu critical sections have completed
 591          * Remembering the driver side flush operation will happen before the
 592          * net device is removed.
 593          */
 594         old_dev = xchg(&dtab->netdev_map[i], dev);
 595         if (old_dev)
 596                 call_rcu(&old_dev->rcu, __dev_map_entry_free);
 597
 598         return 0;
 599 }
 600
/* .map_update_elem callback for the array flavour: resolves the ifindex in
 * the calling task's network namespace.
 */
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                               u64 map_flags)
{
        return __dev_map_update_elem(current->nsproxy->net_ns,
                                     map, key, value, map_flags);
}
 607
/* Insert or replace the DEVMAP_HASH entry stored under *key with the device
 * whose ifindex is *value, resolved in namespace @net.
 *
 * Returns 0 on success, -EINVAL for flags above BPF_EXIST or an ifindex of 0,
 * -EEXIST if BPF_NOEXIST was requested and the key is present, -E2BIG if a new
 * key would exceed max_entries, otherwise the error from
 * __dev_map_alloc_node().
 */
static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
                                     void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        u32 ifindex = *(u32 *)value;
        u32 idx = *(u32 *)key;
        unsigned long flags;
        /* result of the BPF_NOEXIST collision exit below */
        int err = -EEXIST;

        if (unlikely(map_flags > BPF_EXIST || !ifindex))
                return -EINVAL;

        /* lookup, unlink and insert all happen under index_lock */
        spin_lock_irqsave(&dtab->index_lock, flags);

        old_dev = __dev_map_hash_lookup_elem(map, idx);
        if (old_dev && (map_flags & BPF_NOEXIST))
                goto out_err;

        dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
        if (IS_ERR(dev)) {
                err = PTR_ERR(dev);
                goto out_err;
        }

        if (old_dev) {
                /* replacement: items stays unchanged */
                hlist_del_rcu(&old_dev->index_hlist);
        } else {
                if (dtab->items >= dtab->map.max_entries) {
                        spin_unlock_irqrestore(&dtab->index_lock, flags);
                        /* dev was never linked into the map; the callback
                         * drops its device reference and frees it
                         */
                        call_rcu(&dev->rcu, __dev_map_entry_free);
                        return -E2BIG;
                }
                dtab->items++;
        }

        hlist_add_head_rcu(&dev->index_hlist,
                           dev_map_index_hash(dtab, idx));
        spin_unlock_irqrestore(&dtab->index_lock, flags);

        /* old_dev is already unlinked; free it through call_rcu() so RCU
         * readers that still hold it can finish first
         */
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        return 0;

out_err:
        spin_unlock_irqrestore(&dtab->index_lock, flags);
        return err;
}
 657
/* .map_update_elem callback for the hash flavour: resolves the ifindex in the
 * calling task's network namespace.
 */
static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
                                   u64 map_flags)
{
        return __dev_map_hash_update_elem(current->nsproxy->net_ns,
                                         map, key, value, map_flags);
}
 664
/* Map operations for the array-indexed devmap. Allocation and teardown are
 * shared with dev_map_hash_ops; key iteration, lookup, update and delete are
 * the array-specific variants.
 */
const struct bpf_map_ops dev_map_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_get_next_key,
        .map_lookup_elem = dev_map_lookup_elem,
        .map_update_elem = dev_map_update_elem,
        .map_delete_elem = dev_map_delete_elem,
        .map_check_btf = map_check_no_btf,
};
 674
/* Map operations for the hash-indexed devmap. Allocation and teardown are
 * shared with dev_map_ops; key iteration, lookup, update and delete are the
 * hash-specific variants.
 */
const struct bpf_map_ops dev_map_hash_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_hash_get_next_key,
        .map_lookup_elem = dev_map_hash_lookup_elem,
        .map_update_elem = dev_map_hash_update_elem,
        .map_delete_elem = dev_map_hash_delete_elem,
        .map_check_btf = map_check_no_btf,
};
 684
 685 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
 686                                        struct net_device *netdev)
 687 {
 688         unsigned long flags;
 689         u32 i;
 690
 691         spin_lock_irqsave(&dtab->index_lock, flags);
 692         for (i = 0; i < dtab->n_buckets; i++) {
 693                 struct bpf_dtab_netdev *dev;
 694                 struct hlist_head *head;
 695                 struct hlist_node *next;
 696
 697                 head = dev_map_index_hash(dtab, i);
 698
 699                 hlist_for_each_entry_safe(dev, next, head, index_hlist) {
 700                         if (netdev != dev->dev)
 701                                 continue;
 702
 703                         dtab->items--;
 704                         hlist_del_rcu(&dev->index_hlist);
 705                         call_rcu(&dev->rcu, __dev_map_entry_free);
 706                 }
 707         }
 708         spin_unlock_irqrestore(&dtab->index_lock, flags);
 709 }
 710
 711 static int dev_map_notification(struct notifier_block *notifier,
 712                                 ulong event, void *ptr)
 713 {
 714         struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
 715         struct bpf_dtab *dtab;
 716         int i, cpu;
 717
 718         switch (event) {
 719         case NETDEV_REGISTER:
 720                 if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
 721                         break;
 722
 723                 /* will be freed in free_netdev() */
 724                 netdev->xdp_bulkq =
 725                         __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
 726                                            sizeof(void *), GFP_ATOMIC);
 727                 if (!netdev->xdp_bulkq)
 728                         return NOTIFY_BAD;
 729
 730                 for_each_possible_cpu(cpu)
 731                         per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
 732                 break;
 733         case NETDEV_UNREGISTER:
 734                 /* This rcu_read_lock/unlock pair is needed because
 735                  * dev_map_list is an RCU list AND to ensure a delete
 736                  * operation does not free a netdev_map entry while we
 737                  * are comparing it against the netdev being unregistered.
 738                  */
 739                 rcu_read_lock();
 740                 list_for_each_entry_rcu(dtab, &dev_map_list, list) {
 741                         if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 742                                 dev_map_hash_remove_netdev(dtab, netdev);
 743                                 continue;
 744                         }
 745
 746                         for (i = 0; i < dtab->map.max_entries; i++) {
 747                                 struct bpf_dtab_netdev *dev, *odev;
 748
 749                                 dev = READ_ONCE(dtab->netdev_map[i]);
 750                                 if (!dev || netdev != dev->dev)
 751                                         continue;
 752                                 odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
 753                                 if (dev == odev)
 754                                         call_rcu(&dev->rcu,
 755                                                  __dev_map_entry_free);
 756                         }
 757                 }
 758                 rcu_read_unlock();
 759                 break;
 760         default:
 761                 break;
 762         }
 763         return NOTIFY_OK;
 764 }
 765
/* Notifier block registered from dev_map_init(); routes netdevice events to
 * dev_map_notification().
 */
static struct notifier_block dev_map_notifier = {
        .notifier_call = dev_map_notification,
};
 769
/* Boot-time setup: register the netdevice notifier and initialise every
 * possible CPU's flush list. Always returns 0.
 */
static int __init dev_map_init(void)
{
        int cpu;

        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                     offsetof(struct _bpf_dtab_netdev, dev));
        /* NOTE(review): the return value of the registration is not checked */
        register_netdevice_notifier(&dev_map_notifier);

        for_each_possible_cpu(cpu)
                INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
        return 0;
}

subsys_initcall(dev_map_init);