/* kernel/bpf/ringbuf.c */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <uapi/linux/btf.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
        (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

/* Maximum size of the ring buffer data area is limited by the 32-bit page
 * offset within the record header, counted in pages. Reserve 8 bits for
 * extensibility, and account for a few extra pages for the consumer/producer
 * pages and the non-mmap()'able part. This gives a 64GB limit, which seems
 * plenty for a single ring buffer.
 */
#define RINGBUF_MAX_DATA_SZ \
        (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)

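/* Worked example (a sketch, assuming 4KB pages and that the kernel-only
 * metadata part of struct bpf_ringbuf fits into a single page, so
 * RINGBUF_PGOFF == 1): the limit becomes (2^24 - 2 - 1) * 4096 bytes,
 * i.e. just under 64GB of data area for one ring buffer.
 */
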
struct bpf_ringbuf {
        wait_queue_head_t waitq;
        struct irq_work work;
        u64 mask;
        struct page **pages;
        int nr_pages;
        spinlock_t spinlock ____cacheline_aligned_in_smp;
        /* Consumer and producer counters are put into separate pages to allow
         * mapping the consumer page as r/w while keeping the producer page
         * r/o. This prevents a user-space application from modifying the
         * producer position and ruining in-kernel position tracking.
         */
        unsigned long consumer_pos __aligned(PAGE_SIZE);
        unsigned long producer_pos __aligned(PAGE_SIZE);
        char data[] __aligned(PAGE_SIZE);
};

struct bpf_ringbuf_map {
        struct bpf_map map;
        struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
        u32 len;
        u32 pg_off;
};

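/* Consumer-side sketch (illustrative only, not part of this file) of how the
 * header is interpreted, given a struct bpf_ringbuf_hdr *hdr pointing at the
 * next unconsumed record; BPF_RINGBUF_BUSY_BIT, BPF_RINGBUF_DISCARD_BIT and
 * BPF_RINGBUF_HDR_SZ come from include/uapi/linux/bpf.h, and consume() is a
 * placeholder callback:
 *
 *	u32 len = smp_load_acquire(&hdr->len);
 *
 *	if (len & BPF_RINGBUF_BUSY_BIT)
 *		break;			// producer hasn't committed this record yet
 *	if (!(len & BPF_RINGBUF_DISCARD_BIT))
 *		consume(hdr + 1, len);	// payload starts right after the 8-byte header
 *	// advance past header + payload, rounded up to 8, with flag bits cleared
 *	cons_pos += round_up((len & ~(BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT)) +
 *			     BPF_RINGBUF_HDR_SZ, 8);
 */
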
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
        const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
                            __GFP_NOWARN | __GFP_ZERO;
        int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
        int nr_data_pages = data_sz >> PAGE_SHIFT;
        int nr_pages = nr_meta_pages + nr_data_pages;
        struct page **pages, *page;
        struct bpf_ringbuf *rb;
        size_t array_size;
        int i;

        /* Each data page is mapped twice to allow "virtually"
         * contiguous reads of samples wrapping around the end of the ring
         * buffer area:
         * ------------------------------------------------------
         * | meta pages |  real data pages  |  same data pages  |
         * ------------------------------------------------------
         * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
         * ------------------------------------------------------
         * |            | TA             DA | TA             DA |
         * ------------------------------------------------------
         *                               ^^^^^^^
         *                                  |
         * A sample that starts near the end of the data area (marked above)
         * continues seamlessly into the second mapping of the same pages, so
         * there is no need for special handling of wrapped-around data. This
         * works both in the kernel and when mmap()'ed in user-space,
         * simplifying both implementations significantly (see the read sketch
         * after this function).
         */
        array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
        pages = bpf_map_area_alloc(array_size, numa_node);
        if (!pages)
                return NULL;

        for (i = 0; i < nr_pages; i++) {
                page = alloc_pages_node(numa_node, flags, 0);
                if (!page) {
                        nr_pages = i;
                        goto err_free_pages;
                }
                pages[i] = page;
                if (i >= nr_meta_pages)
                        pages[nr_data_pages + i] = page;
        }

        rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
                  VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
        if (rb) {
                rb->pages = pages;
                rb->nr_pages = nr_pages;
                return rb;
        }

err_free_pages:
        for (i = 0; i < nr_pages; i++)
                __free_page(pages[i]);
        kvfree(pages);
        return NULL;
}
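
/* Read-side sketch (illustrative, not part of this file): because the data
 * pages are mapped twice back-to-back, a sample at any position can be read
 * with a single flat copy, even if it physically wraps around the end of the
 * data area. Here data_sz is the ring size (power of two), cons_pos the
 * consumer counter, and len comes from the record header:
 *
 *	void *rec = data + (cons_pos & (data_sz - 1));
 *	memcpy(out, rec + BPF_RINGBUF_HDR_SZ, len);	// no wrap-around split needed
 */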

static void bpf_ringbuf_notify(struct irq_work *work)
{
        struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

        wake_up_all(&rb->waitq);
}

static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
        struct bpf_ringbuf *rb;

        rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
        if (!rb)
                return NULL;

        spin_lock_init(&rb->spinlock);
        init_waitqueue_head(&rb->waitq);
        init_irq_work(&rb->work, bpf_ringbuf_notify);

        rb->mask = data_sz - 1;
        rb->consumer_pos = 0;
        rb->producer_pos = 0;

        return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
        struct bpf_ringbuf_map *rb_map;

        if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        if (attr->key_size || attr->value_size ||
            !is_power_of_2(attr->max_entries) ||
            !PAGE_ALIGNED(attr->max_entries))
                return ERR_PTR(-EINVAL);

#ifdef CONFIG_64BIT
        /* on 32-bit arch, it's impossible to overflow record's hdr->pg_off */
        if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
                return ERR_PTR(-E2BIG);
#endif

        rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT);
        if (!rb_map)
                return ERR_PTR(-ENOMEM);

        bpf_map_init_from_attr(&rb_map->map, attr);

        rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
        if (!rb_map->rb) {
                kfree(rb_map);
                return ERR_PTR(-ENOMEM);
        }

        return &rb_map->map;
}

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
        /* copy pages pointer and nr_pages to local variables, as we are going
         * to unmap rb itself with vunmap() below
         */
        struct page **pages = rb->pages;
        int i, nr_pages = rb->nr_pages;

        vunmap(rb);
        for (i = 0; i < nr_pages; i++)
                __free_page(pages[i]);
        kvfree(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
        struct bpf_ringbuf_map *rb_map;

        rb_map = container_of(map, struct bpf_ringbuf_map, map);
        bpf_ringbuf_free(rb_map->rb);
        kfree(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
        return ERR_PTR(-ENOTSUPP);
}

static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
                                   u64 flags)
{
        return -ENOTSUPP;
}

static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
        return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
                                    void *next_key)
{
        return -ENOTSUPP;
}

static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
        struct bpf_ringbuf_map *rb_map;

        rb_map = container_of(map, struct bpf_ringbuf_map, map);

        if (vma->vm_flags & VM_WRITE) {
                /* allow writable mapping for the consumer_pos only */
                if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
                        return -EPERM;
        } else {
                vma->vm_flags &= ~VM_MAYWRITE;
        }
        /* remap_vmalloc_range() checks size and offset constraints */
        return remap_vmalloc_range(vma, rb_map->rb,
                                   vma->vm_pgoff + RINGBUF_PGOFF);
}
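
/* User-space mapping sketch (illustrative; the offsets mirror the page layout
 * above, and this is roughly what libbpf does when setting up a ring buffer):
 *
 *	// consumer_pos page: read-write, at offset 0
 *	cons = mmap(NULL, page_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
 *	// producer_pos page + double-mapped data area: read-only, from page 1
 *	prod = mmap(NULL, page_sz + 2 * max_entries, PROT_READ, MAP_SHARED,
 *		    map_fd, page_sz);
 *	data = prod + page_sz;
 *
 * A writable mapping that covers anything beyond the consumer page fails with
 * -EPERM, as enforced by ringbuf_map_mmap() above.
 */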

static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
        unsigned long cons_pos, prod_pos;

        cons_pos = smp_load_acquire(&rb->consumer_pos);
        prod_pos = smp_load_acquire(&rb->producer_pos);
        return prod_pos - cons_pos;
}

static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
                                 struct poll_table_struct *pts)
{
        struct bpf_ringbuf_map *rb_map;

        rb_map = container_of(map, struct bpf_ringbuf_map, map);
        poll_wait(filp, &rb_map->rb->waitq, pts);

        if (ringbuf_avail_data_sz(rb_map->rb))
                return EPOLLIN | EPOLLRDNORM;
        return 0;
}
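
/* User-space wait sketch (illustrative): the map fd itself is pollable, so a
 * consumer can block in epoll until a producer's commit queues the wakeup
 * irq_work; drain_samples() is a placeholder for the consumer loop sketched
 * earlier:
 *
 *	struct epoll_event ev = { .events = EPOLLIN }, out;
 *
 *	epoll_ctl(epoll_fd, EPOLL_CTL_ADD, map_fd, &ev);
 *	if (epoll_wait(epoll_fd, &out, 1, timeout_ms) > 0)
 *		drain_samples();
 */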

static int ringbuf_map_btf_id;
const struct bpf_map_ops ringbuf_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = ringbuf_map_alloc,
        .map_free = ringbuf_map_free,
        .map_mmap = ringbuf_map_mmap,
        .map_poll = ringbuf_map_poll,
        .map_lookup_elem = ringbuf_map_lookup_elem,
        .map_update_elem = ringbuf_map_update_elem,
        .map_delete_elem = ringbuf_map_delete_elem,
        .map_get_next_key = ringbuf_map_get_next_key,
        .map_btf_name = "bpf_ringbuf_map",
        .map_btf_id = &ringbuf_map_btf_id,
};

/* Given a pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate the offset from the record metadata to the ring buffer in pages,
 * rounded down. This page offset is stored as part of the record metadata (at
 * offset 4 of the record header) and allows restoring the struct bpf_ringbuf *
 * from a record pointer.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
                                     struct bpf_ringbuf_hdr *hdr)
{
        return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given a pointer to a ring buffer record header, restore the pointer to
 * struct bpf_ringbuf itself using the page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
        unsigned long addr = (unsigned long)(void *)hdr;
        unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

        return (void *)((addr & PAGE_MASK) - off);
}
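
/* Worked example (a sketch, assuming 4KB pages): for a record whose header
 * sits at rb + 3 * 4096 + 104, bpf_ringbuf_rec_pg_off() stores pg_off = 3.
 * On commit, bpf_ringbuf_restore_from_rec() masks the header address down to
 * its page start (rb + 3 * 4096) and subtracts 3 pages, landing back on rb,
 * which is itself page-aligned.
 */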

static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
        unsigned long cons_pos, prod_pos, new_prod_pos, flags;
        u32 len, pg_off;
        struct bpf_ringbuf_hdr *hdr;

        if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
                return NULL;

        len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
        if (len > rb->mask + 1)
                return NULL;

        cons_pos = smp_load_acquire(&rb->consumer_pos);

        if (in_nmi()) {
                if (!spin_trylock_irqsave(&rb->spinlock, flags))
                        return NULL;
        } else {
                spin_lock_irqsave(&rb->spinlock, flags);
        }

        prod_pos = rb->producer_pos;
        new_prod_pos = prod_pos + len;

        /* check for out-of-space condition by ensuring the producer position
         * doesn't advance more than (ringbuf_size - 1) bytes ahead of the
         * consumer position
         */
        if (new_prod_pos - cons_pos > rb->mask) {
                spin_unlock_irqrestore(&rb->spinlock, flags);
                return NULL;
        }

        hdr = (void *)rb->data + (prod_pos & rb->mask);
        pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
        hdr->len = size | BPF_RINGBUF_BUSY_BIT;
        hdr->pg_off = pg_off;

        /* pairs with consumer's smp_load_acquire() */
        smp_store_release(&rb->producer_pos, new_prod_pos);

        spin_unlock_irqrestore(&rb->spinlock, flags);

        return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}
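
/* Worked example (a sketch) of the space check above: with a 4096-byte ring
 * (mask = 4095), cons_pos = 0 and prod_pos = 4000, reserving a 96-byte sample
 * needs len = round_up(96 + 8, 8) = 104, so new_prod_pos = 4104 and
 * 4104 - 0 > 4095, i.e. the reservation fails until the consumer advances.
 */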

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
        struct bpf_ringbuf_map *rb_map;

        if (unlikely(flags))
                return 0;

        rb_map = container_of(map, struct bpf_ringbuf_map, map);
        return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
        .func           = bpf_ringbuf_reserve,
        .ret_type       = RET_PTR_TO_ALLOC_MEM_OR_NULL,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_CONST_ALLOC_SIZE_OR_ZERO,
        .arg3_type      = ARG_ANYTHING,
};
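
/* BPF-program-side sketch (illustrative; "events", struct event and
 * fill_rest() are hypothetical names for a BPF_MAP_TYPE_RINGBUF map, its
 * sample type and a validation step):
 *
 *	struct event *e;
 *
 *	e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
 *	if (!e)
 *		return 0;		// ring is full, sample is dropped
 *	e->pid = bpf_get_current_pid_tgid() >> 32;
 *	if (!fill_rest(e))
 *		bpf_ringbuf_discard(e, 0);
 *	else
 *		bpf_ringbuf_submit(e, 0);
 *
 * Every successful reserve must be paired with exactly one submit or discard;
 * the verifier enforces this for the returned pointer.
 */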

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
        unsigned long rec_pos, cons_pos;
        struct bpf_ringbuf_hdr *hdr;
        struct bpf_ringbuf *rb;
        u32 new_len;

        hdr = sample - BPF_RINGBUF_HDR_SZ;
        rb = bpf_ringbuf_restore_from_rec(hdr);
        new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
        if (discard)
                new_len |= BPF_RINGBUF_DISCARD_BIT;

        /* update record header with correct final size prefix */
        xchg(&hdr->len, new_len);

        /* if consumer caught up and is waiting for our record, notify about
         * new data availability
         */
        rec_pos = (void *)hdr - (void *)rb->data;
        cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

        if (flags & BPF_RB_FORCE_WAKEUP)
                irq_work_queue(&rb->work);
        else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
                irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
        bpf_ringbuf_commit(sample, flags, false /* discard */);
        return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
        .func           = bpf_ringbuf_submit,
        .ret_type       = RET_VOID,
        .arg1_type      = ARG_PTR_TO_ALLOC_MEM,
        .arg2_type      = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
        bpf_ringbuf_commit(sample, flags, true /* discard */);
        return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
        .func           = bpf_ringbuf_discard,
        .ret_type       = RET_VOID,
        .arg1_type      = ARG_PTR_TO_ALLOC_MEM,
        .arg2_type      = ARG_ANYTHING,
};

BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
           u64, flags)
{
        struct bpf_ringbuf_map *rb_map;
        void *rec;

        if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
                return -EINVAL;

        rb_map = container_of(map, struct bpf_ringbuf_map, map);
        rec = __bpf_ringbuf_reserve(rb_map->rb, size);
        if (!rec)
                return -EAGAIN;

        memcpy(rec, data, size);
        bpf_ringbuf_commit(rec, flags, false /* discard */);
        return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
        .func           = bpf_ringbuf_output,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};
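
/* BPF-program-side sketch (illustrative; "events" and struct event are
 * hypothetical names): bpf_ringbuf_output() copies an already-built sample,
 * trading an extra memcpy for not having to deal with the reserve/submit
 * pairing:
 *
 *	struct event e = { .pid = bpf_get_current_pid_tgid() >> 32 };
 *
 *	if (bpf_ringbuf_output(&events, &e, sizeof(e), 0))
 *		return 0;		// -EINVAL or -EAGAIN; sample is dropped
 */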

BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
        struct bpf_ringbuf *rb;

        rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

        switch (flags) {
        case BPF_RB_AVAIL_DATA:
                return ringbuf_avail_data_sz(rb);
        case BPF_RB_RING_SIZE:
                return rb->mask + 1;
        case BPF_RB_CONS_POS:
                return smp_load_acquire(&rb->consumer_pos);
        case BPF_RB_PROD_POS:
                return smp_load_acquire(&rb->producer_pos);
        default:
                return 0;
        }
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
        .func           = bpf_ringbuf_query,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
};
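
/* BPF-program-side sketch (illustrative; "events" is a hypothetical
 * BPF_MAP_TYPE_RINGBUF map): a producer can apply rough back-pressure by
 * checking how full the ring is before reserving; the returned values are
 * momentary snapshots only:
 *
 *	__u64 avail = bpf_ringbuf_query(&events, BPF_RB_AVAIL_DATA);
 *	__u64 size  = bpf_ringbuf_query(&events, BPF_RB_RING_SIZE);
 *
 *	if (avail * 2 > size)
 *		return 0;		// ring more than half full, skip this sample
 */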