/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

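/*
 * Fork return trampoline in entry_64.S; new tasks start executing here
 * (see the kernel-thread setup in copy_thread() below).
 */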
asmlinkage extern void ret_from_fork(void);

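/*
 * Per-cpu copy of the current task's user stack pointer.  The 64-bit
 * syscall fast path does not store the user rsp in pt_regs, so it is
 * kept here (and in thread.usersp) instead.
 */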
__visible DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
                        regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

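        /*
         * On x86-64 the fs/gs base addresses live in MSRs rather than in
         * the segment descriptors; MSR_KERNEL_GS_BASE holds the inactive
         * (shadow) gs base that swapgs exchanges on kernel entry/exit.
         */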
        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
                        es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
                        cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);

        /* Only print out debug registers if they are in their non-default state. */
        if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
            (d6 == DR6_RESERVED) && (d7 == 0x400))
                return;

        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

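/*
 * Called when a dead task's outstanding references are dropped.  A
 * 64-bit task must not exit with a live modified LDT; if one is still
 * present, something leaked, hence the BUG() below.
 */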
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

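/*
 * Encode a small (< 4GB) segment base into one of the task's TLS GDT
 * slots.  Used by do_arch_prctl() below so that such bases can be
 * switched via the GDT instead of an MSR write.
 */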
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

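/*
 * Set up the kernel stack, pt_regs and segment state of a newly forked
 * task.  Kernel threads get a register frame that makes ret_from_fork
 * call a function instead of returning to user mode.
 */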
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
        p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(p->flags & PF_KTHREAD)) {
                /*
                 * Kernel thread: ret_from_fork will call the function in
                 * bx, passing the argument in bp, instead of returning
                 * to user mode.
                 */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->sp = (unsigned long)childregs;
                childregs->ss = __KERNEL_DS;
                childregs->bx = sp; /* function */
                childregs->bp = arg;
                childregs->orig_ax = -1;
                childregs->cs = __KERNEL_CS | get_kernel_rpl();
                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
                return 0;
        }
        *childregs = *current_pt_regs();

        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;

        err = -ENOMEM;
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?  On 64-bit, clone() passes
         * the TLS argument in r8; an ia32 child instead passes a pointer
         * to a struct user_desc in si.
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp  = new_sp;
        regs->ip                = new_ip;
        regs->sp                = new_sp;
        this_cpu_write(old_rsp, new_sp);
        regs->cs                = _cs;
        regs->ss                = _ss;
        regs->flags             = X86_EFLAGS_IF;
}

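/*
 * Reset the register and segment state for a freshly exec'ed 64-bit
 * binary.
 */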
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported here either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

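        /*
         * switch_fpu_prepare() decides whether the outgoing task's FPU
         * state must be saved (and the incoming task's state restored)
         * now; switch_fpu_finish() below completes the hand-over.
         */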
        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
         * (e.g. xen_load_tls())
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A nonzero segment selector always requires a reload, as does a
         * change of selector.  When the previous task used a 64-bit base,
         * always reload to avoid leaking its base address.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * If the user used a selector != 0, clear the saved
                 * 64-bit base: a base is only tracked for the null
                 * selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* When the next task has a 64-bit base, use it. */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = this_cpu_read(old_rsp);
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);

        /*
         * If it were not for PREEMPT_ACTIVE we could guarantee that the
         * preempt_count of all tasks was equal here and this would not be
         * needed.
         */
        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);

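        /*
         * Point the per-cpu kernel_stack reference at the top of the
         * incoming task's stack so the syscall entry path finds the
         * right kernel stack.
         */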
        this_cpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: this overwrites the user's setup and should use two bits.
         * But 64-bit processes have always behaved this way, so it's not
         * too bad.  The main problem is just that 32-bit children are
         * affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                if (current->mm)
                        current->mm->context.ia32_compat = TIF_X32;
                current->personality &= ~READ_IMPLIES_EXEC;
                /*
                 * is_compat_task() uses the presence of the x32 syscall
                 * bit flag to determine compat status.
                 */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                if (current->mm)
                        current->mm->context.ia32_compat = TIF_IA32;
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

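/*
 * Report the address a sleeping task is blocked at (its "wait channel")
 * by walking the saved frame-pointer chain until an address outside the
 * scheduler is found.  This is only reliable when the kernel is built
 * with frame pointers.
 */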
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* Handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs. */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* Handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* Set the selector to 0 to not confuse
                                   __switch_to. */
                                loadsegment(fs, 0);
                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

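/*
 * arch_prctl(2) entry point.  A minimal userspace sketch (using the raw
 * syscall(2) wrapper; not part of this file):
 *
 *      #include <stdio.h>
 *      #include <asm/prctl.h>
 *      #include <sys/syscall.h>
 *      #include <unistd.h>
 *
 *      unsigned long base;
 *      if (syscall(SYS_arch_prctl, ARCH_GET_FS, &base) == 0)
 *              printf("fs base: %#lx\n", base);
 */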
long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

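/*
 * Return the task's user stack pointer.  For 64-bit tasks the syscall
 * fast path keeps the user rsp in thread.usersp rather than in pt_regs,
 * hence the two cases.
 */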
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}