ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag
[linux-2.6-microblaze.git] kernel/entry/common.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 * @regs:	Pointer to currently saved registers
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long ti_work, unsigned long work)
{
        long ret = 0;

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

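/*
 * Transitional: syscall entry work is tracked both in the SYSCALL_WORK
 * bits and in the legacy TIF based SYSCALL_ENTER_WORK mask until all
 * flags have been migrated to the syscall_work mechanism, so both are
 * checked below.
 */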
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long ti_work;

        ti_work = READ_ONCE(current_thread_info()->flags);
        if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK)
                syscall = syscall_trace_enter(regs, syscall, ti_work, work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

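/*
 * Illustrative sketch, not part of this file: an architecture's syscall
 * entry code is expected to use the helpers above roughly like this,
 * where arch_syscall_nr() and invoke_syscall() stand in for hypothetical
 * arch specific helpers:
 *
 *	noinstr void arch_do_syscall(struct pt_regs *regs)
 *	{
 *		long nr = syscall_enter_from_user_mode(regs, arch_syscall_nr(regs));
 *
 *		if (nr >= 0)
 *			invoke_syscall(regs, nr);
 *		syscall_exit_to_user_mode(regs);
 *	}
 *
 * Architectures which need to do extra work with interrupts enabled before
 * the entry work runs can call syscall_enter_from_user_mode_prepare()
 * followed by syscall_enter_from_user_mode_work() instead of the combined
 * syscall_enter_from_user_mode().
 */
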
/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }

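/*
 * TIF_NOTIFY_SIGNAL only requires running any pending task work, while a
 * queued signal needs the full architecture specific signal delivery and
 * syscall restart handling in arch_do_signal_or_restart().
 */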
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
        if (ti_work & _TIF_NOTIFY_SIGNAL)
                tracehook_notify_signal();

        arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
                        handle_signal_work(regs, ti_work);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();
                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

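/*
 * Architectures which do not define _TIF_SINGLESTEP get a stub which
 * never requests a single step report.
 */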
#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long work)
{
        return false;
}
#else
/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). The syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
        if (!(work & SYSCALL_WORK_SYSCALL_EMU))
                return false;

        return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
}
#endif

static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work,
                              unsigned long work)
{
        bool step;

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        u32 cached_flags = READ_ONCE(current_thread_info()->flags);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK))
                syscall_exit_work(regs, cached_flags, work);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

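/*
 * Illustrative sketch, not part of this file: architecture interrupt entry
 * points are expected to bracket the actual handler with irqentry_enter()
 * and irqentry_exit(), with handle_arch_irq() standing in for a
 * hypothetical arch specific handler:
 *
 *	irqentry_state_t state = irqentry_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_arch_irq(regs);
 *	instrumentation_end();
 *
 *	irqentry_exit(regs, state);
 */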
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        lockdep_hardirqs_off(CALLER_ADDR0);
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        trace_hardirqs_off_finish();
        instrumentation_end();

        return ret;
}

void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        irqentry_exit_cond_resched();
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}

irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
        irqentry_state_t irq_state;

        irq_state.lockdep = lockdep_hardirqs_enabled();

        __nmi_enter();
        lockdep_hardirqs_off(CALLER_ADDR0);
        lockdep_hardirq_enter();
        rcu_nmi_enter();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        ftrace_nmi_enter();
        instrumentation_end();

        return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
        instrumentation_begin();
        ftrace_nmi_exit();
        if (irq_state.lockdep) {
                trace_hardirqs_on_prepare();
                lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        }
        instrumentation_end();

        rcu_nmi_exit();
        lockdep_hardirq_exit();
        if (irq_state.lockdep)
                lockdep_hardirqs_on(CALLER_ADDR0);
        __nmi_exit();
}
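
/*
 * Illustrative sketch, not part of this file: NMI-like entry points are
 * expected to pair the two helpers above around the handler, with
 * handle_arch_nmi() as a hypothetical arch specific handler:
 *
 *	irqentry_state_t irq_state = irqentry_nmi_enter(regs);
 *
 *	instrumentation_begin();
 *	handle_arch_nmi(regs);
 *	instrumentation_end();
 *
 *	irqentry_nmi_exit(regs, irq_state);
 */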