kernel/bpf/task_iter.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */

#include <linux/init.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

struct bpf_iter_seq_task_common {
        struct pid_namespace *ns;
};

struct bpf_iter_seq_task_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        u32 tid;
};

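/*
 * Find the task with tid >= *tid in the given pid namespace and return it
 * with a reference held; *tid is updated to the tid actually found. With
 * skip_if_dup_files, non-leader threads that share their files table with
 * the thread group leader are skipped, so callers iterating fds or vmas do
 * not visit the same tables more than once.
 */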
static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
                                             u32 *tid,
                                             bool skip_if_dup_files)
{
        struct task_struct *task = NULL;
        struct pid *pid;

        rcu_read_lock();
retry:
        pid = find_ge_pid(*tid, ns);
        if (pid) {
                *tid = pid_nr_ns(pid, ns);
                task = get_pid_task(pid, PIDTYPE_PID);
                if (!task) {
                        ++*tid;
                        goto retry;
                } else if (skip_if_dup_files && !thread_group_leader(task) &&
                           task->files == task->group_leader->files) {
                        put_task_struct(task);
                        task = NULL;
                        ++*tid;
                        goto retry;
                }
        }
        rcu_read_unlock();

        return task;
}

static void *task_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_info *info = seq->private;
        struct task_struct *task;

        task = task_seq_get_next(info->common.ns, &info->tid, false);
        if (!task)
                return NULL;

        if (*pos == 0)
                ++*pos;
        return task;
}

static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_info *info = seq->private;
        struct task_struct *task;

        ++*pos;
        ++info->tid;
        put_task_struct((struct task_struct *)v);
        task = task_seq_get_next(info->common.ns, &info->tid, false);
        if (!task)
                return NULL;

        return task;
}

struct bpf_iter__task {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
};

DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)

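/*
 * A minimal usage sketch (not part of this file): a BPF program that
 * attaches to the "task" iterator target, assuming the usual libbpf
 * conventions (vmlinux.h, SEC("iter/task"), BPF_SEQ_PRINTF):
 *
 *   SEC("iter/task")
 *   int dump_task(struct bpf_iter__task *ctx)
 *   {
 *           struct seq_file *seq = ctx->meta->seq;
 *           struct task_struct *task = ctx->task;
 *
 *           // task is NULL on the final stop() invocation
 *           if (!task)
 *                   return 0;
 *           BPF_SEQ_PRINTF(seq, "%8d %s\n", task->pid, task->comm);
 *           return 0;
 *   }
 */
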
static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
                           bool in_stop)
{
        struct bpf_iter_meta meta;
        struct bpf_iter__task ctx;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = task;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_seq_show(struct seq_file *seq, void *v)
{
        return __task_seq_show(seq, v, false);
}

static void task_seq_stop(struct seq_file *seq, void *v)
{
        if (!v)
                (void)__task_seq_show(seq, v, true);
        else
                put_task_struct((struct task_struct *)v);
}

static const struct seq_operations task_seq_ops = {
        .start  = task_seq_start,
        .next   = task_seq_next,
        .stop   = task_seq_stop,
        .show   = task_seq_show,
};

struct bpf_iter_seq_task_file_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        struct task_struct *task;
        u32 tid;
        u32 fd;
};

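/*
 * Advance to the next (task, fd) pair. When the current task's fd table
 * is exhausted, drop its reference and move on to the next task in the
 * namespace, starting again from fd 0.
 */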
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
        struct pid_namespace *ns = info->common.ns;
        u32 curr_tid = info->tid;
        struct task_struct *curr_task;
        unsigned int curr_fd = info->fd;

        /* If this function returns a non-NULL file object,
         * it holds a reference to both the task and the file.
         * Otherwise, it does not hold any reference.
         */
again:
        if (info->task) {
                curr_task = info->task;
                curr_fd = info->fd;
        } else {
                curr_task = task_seq_get_next(ns, &curr_tid, true);
                if (!curr_task) {
                        info->task = NULL;
                        info->tid = curr_tid;
                        return NULL;
                }

                /* set info->task and info->tid */
                info->task = curr_task;
                if (curr_tid == info->tid) {
                        curr_fd = info->fd;
                } else {
                        info->tid = curr_tid;
                        curr_fd = 0;
                }
        }

        rcu_read_lock();
        for (;; curr_fd++) {
                struct file *f;
                f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
                if (!f)
                        break;
                if (!get_file_rcu(f))
                        continue;

                /* set info->fd */
                info->fd = curr_fd;
                rcu_read_unlock();
                return f;
        }

        /* the current task is done, go to the next task */
        rcu_read_unlock();
        put_task_struct(curr_task);
        info->task = NULL;
        info->fd = 0;
        curr_tid = ++(info->tid);
        goto again;
}

static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;
        struct file *file;

        info->task = NULL;
        file = task_file_seq_get_next(info);
        if (file && *pos == 0)
                ++*pos;

        return file;
}

static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;

        ++*pos;
        ++info->fd;
        fput((struct file *)v);
        return task_file_seq_get_next(info);
}

struct bpf_iter__task_file {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
        u32 fd __aligned(8);
        __bpf_md_ptr(struct file *, file);
};

DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
                     struct task_struct *task, u32 fd,
                     struct file *file)

static int __task_file_seq_show(struct seq_file *seq, struct file *file,
                                bool in_stop)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;
        struct bpf_iter__task_file ctx;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = info->task;
        ctx.fd = info->fd;
        ctx.file = file;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_file_seq_show(struct seq_file *seq, void *v)
{
        return __task_file_seq_show(seq, v, false);
}

static void task_file_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_seq_task_file_info *info = seq->private;

        if (!v) {
                (void)__task_file_seq_show(seq, v, true);
        } else {
                fput((struct file *)v);
                put_task_struct(info->task);
                info->task = NULL;
        }
}

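/*
 * init/fini_seq_pidns() are shared by all three iterator targets, which
 * is why each seq_priv struct must start with
 * struct bpf_iter_seq_task_common. The pid namespace of the process that
 * creates the iterator is pinned here and released in fini.
 */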
static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct bpf_iter_seq_task_common *common = priv_data;

        common->ns = get_pid_ns(task_active_pid_ns(current));
        return 0;
}

static void fini_seq_pidns(void *priv_data)
{
        struct bpf_iter_seq_task_common *common = priv_data;

        put_pid_ns(common->ns);
}

static const struct seq_operations task_file_seq_ops = {
        .start  = task_file_seq_start,
        .next   = task_file_seq_next,
        .stop   = task_file_seq_stop,
        .show   = task_file_seq_show,
};

struct bpf_iter_seq_task_vma_info {
        /* The first field must be struct bpf_iter_seq_task_common.
         * This is assumed by the {init, fini}_seq_pidns() callback functions.
         */
        struct bpf_iter_seq_task_common common;
        struct task_struct *task;
        struct vm_area_struct *vma;
        u32 tid;
        unsigned long prev_vm_start;
        unsigned long prev_vm_end;
};

enum bpf_task_vma_iter_find_op {
        task_vma_iter_first_vma,   /* use mm->mmap */
        task_vma_iter_next_vma,    /* use curr_vma->vm_next */
        task_vma_iter_find_vma,    /* use find_vma() to find next vma */
};

static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
        struct pid_namespace *ns = info->common.ns;
        enum bpf_task_vma_iter_find_op op;
        struct vm_area_struct *curr_vma;
        struct task_struct *curr_task;
        u32 curr_tid = info->tid;

        /* If this function returns a non-NULL vma, it holds a reference to
         * the task_struct, and holds the read lock on vma->mm->mmap_lock.
         * If this function returns NULL, it does not hold any reference or
         * lock.
         */
        if (info->task) {
                curr_task = info->task;
                curr_vma = info->vma;
                /* In case of lock contention, drop mmap_lock to unblock
                 * the writer.
                 *
                 * After relock, call find_vma(mm, prev_vm_end - 1) to find
                 * the new vma to process.
                 *
                 *   +------+------+-----------+
                 *   | VMA1 | VMA2 | VMA3      |
                 *   +------+------+-----------+
                 *   |      |      |           |
                 *  4k     8k     16k         400k
                 *
                 * For example, curr_vma == VMA2. Before unlock, we set
                 *
                 *    prev_vm_start = 8k
                 *    prev_vm_end   = 16k
                 *
                 * There are a few cases:
                 *
                 * 1) VMA2 is freed, but VMA3 exists.
                 *
                 *    find_vma() will return VMA3, just process VMA3.
                 *
                 * 2) VMA2 still exists.
                 *
                 *    find_vma() will return VMA2, process VMA2->next.
                 *
                 * 3) no more vma in this mm.
                 *
                 *    Process the next task.
                 *
                 * 4) find_vma() returns a different vma, VMA2'.
                 *
                 *    4.1) If VMA2 covers the same range as VMA2', skip VMA2',
                 *         because we already covered the range;
                 *    4.2) VMA2 and VMA2' cover different ranges, process
                 *         VMA2'.
                 */
                if (mmap_lock_is_contended(curr_task->mm)) {
                        info->prev_vm_start = curr_vma->vm_start;
                        info->prev_vm_end = curr_vma->vm_end;
                        op = task_vma_iter_find_vma;
                        mmap_read_unlock(curr_task->mm);
                        if (mmap_read_lock_killable(curr_task->mm))
                                goto finish;
                } else {
                        op = task_vma_iter_next_vma;
                }
        } else {
again:
                curr_task = task_seq_get_next(ns, &curr_tid, true);
                if (!curr_task) {
                        info->tid = curr_tid + 1;
                        goto finish;
                }

                if (curr_tid != info->tid) {
                        info->tid = curr_tid;
                        /* new task, process the first vma */
                        op = task_vma_iter_first_vma;
                } else {
                        /* Found the same tid, which means the user space
                         * finished the data in the previous buffer and read
                         * more. We dropped mmap_lock before returning to
                         * user space, so it is necessary to use find_vma()
                         * to find the next vma to process.
                         */
                        op = task_vma_iter_find_vma;
                }

                if (!curr_task->mm)
                        goto next_task;

                if (mmap_read_lock_killable(curr_task->mm))
                        goto finish;
        }

        switch (op) {
        case task_vma_iter_first_vma:
                curr_vma = curr_task->mm->mmap;
                break;
        case task_vma_iter_next_vma:
                curr_vma = curr_vma->vm_next;
                break;
        case task_vma_iter_find_vma:
                /* We dropped mmap_lock so it is necessary to use find_vma()
                 * to find the next vma. This is similar to the mechanism
                 * in show_smaps_rollup().
                 */
                curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1);
                /* case 1) and 4.2) above just use curr_vma */

                /* check for case 2) or case 4.1) above */
                if (curr_vma &&
                    curr_vma->vm_start == info->prev_vm_start &&
                    curr_vma->vm_end == info->prev_vm_end)
                        curr_vma = curr_vma->vm_next;
                break;
        }
        if (!curr_vma) {
                /* case 3) above, or case 2) 4.1) with vma->next == NULL */
                mmap_read_unlock(curr_task->mm);
                goto next_task;
        }
        info->task = curr_task;
        info->vma = curr_vma;
        return curr_vma;

next_task:
        put_task_struct(curr_task);
        info->task = NULL;
        curr_tid++;
        goto again;

finish:
        if (curr_task)
                put_task_struct(curr_task);
        info->task = NULL;
        info->vma = NULL;
        return NULL;
}

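/*
 * The iterator position is tracked in (tid, prev_vm_start, prev_vm_end)
 * rather than in *pos: mmap_lock is dropped whenever control returns to
 * user space, so the next read must re-find its place with find_vma()
 * instead of trusting a stale vma pointer.
 */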
static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;
        struct vm_area_struct *vma;

        vma = task_vma_seq_get_next(info);
        if (vma && *pos == 0)
                ++*pos;

        return vma;
}

static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;

        ++*pos;
        return task_vma_seq_get_next(info);
}

struct bpf_iter__task_vma {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct task_struct *, task);
        __bpf_md_ptr(struct vm_area_struct *, vma);
};

DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
                     struct task_struct *task, struct vm_area_struct *vma)

static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;
        struct bpf_iter__task_vma ctx;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = info->task;
        ctx.vma = info->vma;
        return bpf_iter_run_prog(prog, &ctx);
}

static int task_vma_seq_show(struct seq_file *seq, void *v)
{
        return __task_vma_seq_show(seq, false);
}

static void task_vma_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_seq_task_vma_info *info = seq->private;

        if (!v) {
                (void)__task_vma_seq_show(seq, true);
        } else {
                /* info->vma has not been seen by the BPF program. If the
                 * user space reads more, task_vma_seq_get_next should
                 * return this vma again. Set prev_vm_start to ~0UL,
                 * so that we don't skip the vma returned by the next
                 * find_vma() (case task_vma_iter_find_vma in
                 * task_vma_seq_get_next()).
                 */
                info->prev_vm_start = ~0UL;
                info->prev_vm_end = info->vma->vm_end;
                mmap_read_unlock(info->task->mm);
                put_task_struct(info->task);
                info->task = NULL;
        }
}

static const struct seq_operations task_vma_seq_ops = {
        .start  = task_vma_seq_start,
        .next   = task_vma_seq_next,
        .stop   = task_vma_seq_stop,
        .show   = task_vma_seq_show,
};

static const struct bpf_iter_seq_info task_seq_info = {
        .seq_ops                = &task_seq_ops,
        .init_seq_private       = init_seq_pidns,
        .fini_seq_private       = fini_seq_pidns,
        .seq_priv_size          = sizeof(struct bpf_iter_seq_task_info),
};

static struct bpf_iter_reg task_reg_info = {
        .target                 = "task",
        .feature                = BPF_ITER_RESCHED,
        .ctx_arg_info_size      = 1,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__task, task),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &task_seq_info,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
        .seq_ops                = &task_file_seq_ops,
        .init_seq_private       = init_seq_pidns,
        .fini_seq_private       = fini_seq_pidns,
        .seq_priv_size          = sizeof(struct bpf_iter_seq_task_file_info),
};

static struct bpf_iter_reg task_file_reg_info = {
        .target                 = "task_file",
        .feature                = BPF_ITER_RESCHED,
        .ctx_arg_info_size      = 2,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__task_file, task),
                  PTR_TO_BTF_ID_OR_NULL },
                { offsetof(struct bpf_iter__task_file, file),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &task_file_seq_info,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
        .seq_ops                = &task_vma_seq_ops,
        .init_seq_private       = init_seq_pidns,
        .fini_seq_private       = fini_seq_pidns,
        .seq_priv_size          = sizeof(struct bpf_iter_seq_task_vma_info),
};

static struct bpf_iter_reg task_vma_reg_info = {
        .target                 = "task_vma",
        .feature                = BPF_ITER_RESCHED,
        .ctx_arg_info_size      = 2,
        .ctx_arg_info           = {
                { offsetof(struct bpf_iter__task_vma, task),
                  PTR_TO_BTF_ID_OR_NULL },
                { offsetof(struct bpf_iter__task_vma, vma),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info               = &task_vma_seq_info,
};

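/*
 * bpf_find_vma() helper: find the vma containing 'start' in 'task' and,
 * if found, run callback_fn(task, vma, callback_ctx) on it under the
 * mmap read lock. The helper may run in contexts that cannot sleep
 * (e.g. a perf event BPF program in NMI context), so it returns -EBUSY
 * instead of blocking when the lock or the per-cpu unlock irq_work is
 * unavailable.
 */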
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
           bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
        struct mmap_unlock_irq_work *work = NULL;
        struct vm_area_struct *vma;
        bool irq_work_busy = false;
        struct mm_struct *mm;
        int ret = -ENOENT;

        if (flags)
                return -EINVAL;

        if (!task)
                return -ENOENT;

        mm = task->mm;
        if (!mm)
                return -ENOENT;

        irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);

        if (irq_work_busy || !mmap_read_trylock(mm))
                return -EBUSY;

        vma = find_vma(mm, start);

        if (vma && vma->vm_start <= start && vma->vm_end > start) {
                callback_fn((u64)(long)task, (u64)(long)vma,
                            (u64)(long)callback_ctx, 0, 0);
                ret = 0;
        }
        bpf_mmap_unlock_mm(work, mm);
        return ret;
}

const struct bpf_func_proto bpf_find_vma_proto = {
        .func           = bpf_find_vma,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id    = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_PTR_TO_FUNC,
        .arg4_type      = ARG_PTR_TO_STACK_OR_NULL,
        .arg5_type      = ARG_ANYTHING,
};

DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);

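/*
 * irq_work callback that releases mmap_lock on behalf of a context
 * (e.g. NMI) that acquired it but cannot release it directly. On
 * PREEMPT_RT, bpf_mmap_unlock_get_irq_work() reports the work as busy
 * instead, so this callback should never run there (hence the
 * WARN_ON_ONCE below).
 */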
static void do_mmap_read_unlock(struct irq_work *entry)
{
        struct mmap_unlock_irq_work *work;

        if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
                return;

        work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
        mmap_read_unlock_non_owner(work->mm);
}

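/*
 * Register the task, task_file, and task_vma iterator targets. The BTF
 * ids for the ctx arguments are only known at runtime, so they are
 * filled in here before registration. Also initialize the per-cpu
 * mmap_unlock irq_work used by bpf_find_vma().
 */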
static int __init task_iter_init(void)
{
        struct mmap_unlock_irq_work *work;
        int ret, cpu;

        for_each_possible_cpu(cpu) {
                work = per_cpu_ptr(&mmap_unlock_work, cpu);
                init_irq_work(&work->irq_work, do_mmap_read_unlock);
        }

        task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        ret = bpf_iter_reg_target(&task_reg_info);
        if (ret)
                return ret;

        task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
        ret = bpf_iter_reg_target(&task_file_reg_info);
        if (ret)
                return ret;

        task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
        task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
        return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);