/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

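/*
 * Per-CPU scratch slot: the 64-bit SYSCALL entry path stashes the user
 * stack pointer here; __switch_to() saves/restores it via thread.usersp.
 */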
__visible DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))
		return;

	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

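/*
 * A task's LDT is torn down along with its mm; if a dead task still has
 * one by the time it is released, something leaked, so complain and BUG.
 */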
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

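/*
 * Point one of the task's GDT TLS slots at a 32-bit segment with the
 * given base; do_arch_prctl() uses this for FS/GS bases below 4GB.
 */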
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

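/*
 * Set up the register state of a freshly forked task: kernel threads get
 * a synthesized pt_regs that invokes the requested function; user forks
 * inherit the parent's registers and segment state, duplicate the I/O
 * bitmap if one exists, and honour CLONE_SETTLS.
 */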
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	p->thread.sp = (unsigned long) childregs;
	p->thread.usersp = me->thread.usersp;
	set_tsk_thread_flag(p, TIF_FORK);
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
		return 0;
	}
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

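/*
 * Reset the segment registers and rewrite the saved pt_regs so that a
 * freshly exec'ed task returns to user mode at new_ip with its stack
 * at new_sp.
 */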
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule() instead.
 * The function graph tracer is not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload sp0, the kernel stack pointer used on entry from user mode:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload; also reload
	 * when the selector has changed. When the previous task used a
	 * 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * If the user loaded a selector != 0, clear the saved
		 * 64-bit base: the base MSR is only in effect while
		 * the selector is null.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* When the next task has a 64-bit base, use it. */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the per-CPU bookkeeping: the saved user stack pointer
	 * and the current task pointer.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	/*
	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
	 * preempt_count of all tasks was equal here and this would not be
	 * needed.
	 */
	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: this overwrites the user's setup; there should be two
	 * bits. But 64-bit processes have always behaved this way, so
	 * it's not too bad. The main problem is just that 32-bit
	 * children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_X32;
		current->personality &= ~READ_IMPLIES_EXEC;
		/*
		 * is_compat_task() uses the presence of the x32 syscall
		 * bit flag to determine compat status.
		 */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_IA32;
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

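/*
 * Walk the frame pointers on a sleeping task's kernel stack and return
 * the first return address outside the scheduler; bounded to 16 frames
 * in case the stack is corrupt.
 */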
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

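/*
 * Get or set the FS/GS base of @task. Bases below 4GB go through a GDT
 * TLS slot (cheaper to switch); larger bases use the FS/GS base MSRs.
 */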
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs.
		 */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/*
		 * Handle small bases via the GDT because that's faster
		 * to switch.
		 */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/*
				 * Set the selector to 0 to not confuse
				 * __switch_to().
				 */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
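
/*
 * Userspace usage sketch (assumes glibc's syscall(2) wrapper and the
 * ARCH_* constants from <asm/prctl.h>; not part of this file):
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <asm/prctl.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, 0x100000000UL);
 *	syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 *
 * Note that ARCH_GET_FS/ARCH_GET_GS reuse @addr as the user pointer the
 * base is written to, matching the put_user() calls above.
 */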
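/*
 * User stack pointer as seen by ptrace and /proc: compat tasks keep it
 * in pt_regs, while for 64-bit tasks pt_regs->sp is not kept up to date
 * on the SYSCALL fast path, so thread.usersp is used instead.
 */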
unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}