drivers/vfio/vfio_iommu_spapr_tce.c
1 /*
2  * VFIO: IOMMU DMA mapping support for TCE on POWER
3  *
4  * Copyright (C) 2013 IBM Corp.  All rights reserved.
5  *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  *
11  * Derived from original vfio_iommu_type1.c:
12  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
13  *     Author: Alex Williamson <alex.williamson@redhat.com>
14  */
15
16 #include <linux/module.h>
17 #include <linux/pci.h>
18 #include <linux/slab.h>
19 #include <linux/uaccess.h>
20 #include <linux/err.h>
21 #include <linux/vfio.h>
22 #include <linux/vmalloc.h>
23 #include <linux/sched/mm.h>
24 #include <linux/sched/signal.h>
25
26 #include <asm/iommu.h>
27 #include <asm/tce.h>
28 #include <asm/mmu_context.h>
29
30 #define DRIVER_VERSION  "0.1"
31 #define DRIVER_AUTHOR   "aik@ozlabs.ru"
32 #define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
33
34 static void tce_iommu_detach_group(void *iommu_data,
35                 struct iommu_group *iommu_group);
36
37 static long try_increment_locked_vm(struct mm_struct *mm, long npages)
38 {
39         long ret = 0, locked, lock_limit;
40
41         if (WARN_ON_ONCE(!mm))
42                 return -EPERM;
43
44         if (!npages)
45                 return 0;
46
47         down_write(&mm->mmap_sem);
48         locked = mm->locked_vm + npages;
49         lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
50         if (locked > lock_limit && !capable(CAP_IPC_LOCK))
51                 ret = -ENOMEM;
52         else
53                 mm->locked_vm += npages;
54
55         pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
56                         npages << PAGE_SHIFT,
57                         mm->locked_vm << PAGE_SHIFT,
58                         rlimit(RLIMIT_MEMLOCK),
59                         ret ? " - exceeded" : "");
60
61         up_write(&mm->mmap_sem);
62
63         return ret;
64 }
65
66 static void decrement_locked_vm(struct mm_struct *mm, long npages)
67 {
68         if (!mm || !npages)
69                 return;
70
71         down_write(&mm->mmap_sem);
72         if (WARN_ON_ONCE(npages > mm->locked_vm))
73                 npages = mm->locked_vm;
74         mm->locked_vm -= npages;
75         pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
76                         npages << PAGE_SHIFT,
77                         mm->locked_vm << PAGE_SHIFT,
78                         rlimit(RLIMIT_MEMLOCK));
79         up_write(&mm->mmap_sem);
80 }
81
82 /*
83  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
84  *
85  * This code handles mapping and unmapping of user data buffers
86  * into DMA'ble space using the IOMMU
87  */
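
/*
 * A minimal userspace sketch of the v1 flow against the default 32-bit
 * window, for orientation only: the group path "/dev/vfio/26" and the 64K
 * mapping size are made-up values and all error handling is omitted.
 *
 *      #include <fcntl.h>
 *      #include <sys/ioctl.h>
 *      #include <sys/mman.h>
 *      #include <linux/vfio.h>
 *
 *      int container = open("/dev/vfio/vfio", O_RDWR);
 *      int group = open("/dev/vfio/26", O_RDWR);       // hypothetical group
 *
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *      ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *
 *      struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *      ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *
 *      // v1 containers must be enabled before mapping (see tce_iommu_enable)
 *      ioctl(container, VFIO_IOMMU_ENABLE);
 *
 *      void *buf = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
 *                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      struct vfio_iommu_type1_dma_map map = {
 *              .argsz = sizeof(map),
 *              .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *              .vaddr = (__u64)(unsigned long)buf,
 *              .iova  = info.dma32_window_start,
 *              .size  = 0x10000,
 *      };
 *      ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */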
88
89 struct tce_iommu_group {
90         struct list_head next;
91         struct iommu_group *grp;
92 };
93
94 /*
95  * A container needs to remember which preregistered regions it has
96  * referenced in order to do proper cleanup when the userspace process exits.
97  */
98 struct tce_iommu_prereg {
99         struct list_head next;
100         struct mm_iommu_table_group_mem_t *mem;
101 };
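
/*
 * Rough sketch of how a v2 user preregisters memory before mapping
 * (illustrative only; "buf" and "len" are made-up names, buf must be page
 * aligned and len a multiple of the page size, error handling omitted):
 *
 *      struct vfio_iommu_spapr_register_memory reg = {
 *              .argsz = sizeof(reg),
 *              .vaddr = (__u64)(unsigned long)buf,
 *              .size  = len,
 *      };
 *      ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *      // ... VFIO_IOMMU_MAP_DMA may now target [buf, buf + len) ...
 *      ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 */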
102
103 /*
104  * The container descriptor is not supplied with the IOMMU group at the
105  * moment of initialization (the API attaches groups later), so it keeps
106  * a list of attached groups and checks them for compatibility on attach.
107  */
108 struct tce_container {
109         struct mutex lock;
110         bool enabled;
111         bool v2;
112         bool def_window_pending;
113         unsigned long locked_pages;
114         struct mm_struct *mm;
115         struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
116         struct list_head group_list;
117         struct list_head prereg_list;
118 };
119
120 static long tce_iommu_mm_set(struct tce_container *container)
121 {
122         if (container->mm) {
123                 if (container->mm == current->mm)
124                         return 0;
125                 return -EPERM;
126         }
127         BUG_ON(!current->mm);
128         container->mm = current->mm;
129         atomic_inc(&container->mm->mm_count);
130
131         return 0;
132 }
133
134 static long tce_iommu_prereg_free(struct tce_container *container,
135                 struct tce_iommu_prereg *tcemem)
136 {
137         long ret;
138
139         ret = mm_iommu_put(container->mm, tcemem->mem);
140         if (ret)
141                 return ret;
142
143         list_del(&tcemem->next);
144         kfree(tcemem);
145
146         return 0;
147 }
148
149 static long tce_iommu_unregister_pages(struct tce_container *container,
150                 __u64 vaddr, __u64 size)
151 {
152         struct mm_iommu_table_group_mem_t *mem;
153         struct tce_iommu_prereg *tcemem;
154         bool found = false;
155
156         if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
157                 return -EINVAL;
158
159         mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
160         if (!mem)
161                 return -ENOENT;
162
163         list_for_each_entry(tcemem, &container->prereg_list, next) {
164                 if (tcemem->mem == mem) {
165                         found = true;
166                         break;
167                 }
168         }
169
170         if (!found)
171                 return -ENOENT;
172
173         return tce_iommu_prereg_free(container, tcemem);
174 }
175
176 static long tce_iommu_register_pages(struct tce_container *container,
177                 __u64 vaddr, __u64 size)
178 {
179         long ret = 0;
180         struct mm_iommu_table_group_mem_t *mem = NULL;
181         struct tce_iommu_prereg *tcemem;
182         unsigned long entries = size >> PAGE_SHIFT;
183
184         if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
185                         ((vaddr + size) < vaddr))
186                 return -EINVAL;
187
188         mem = mm_iommu_find(container->mm, vaddr, entries);
189         if (mem) {
190                 list_for_each_entry(tcemem, &container->prereg_list, next) {
191                         if (tcemem->mem == mem)
192                                 return -EBUSY;
193                 }
194         }
195
196         ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
197         if (ret)
198                 return ret;
199
200         tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
201         if (!tcemem) {
202                 mm_iommu_put(container->mm, mem);
203                 return -ENOMEM;
204         }
205
206         tcemem->mem = mem;
207         list_add(&tcemem->next, &container->prereg_list);
208
209         container->enabled = true;
210
211         return 0;
212 }
213
214 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
215 {
216         /*
217          * Check that the TCE table granularity is not bigger than the size of
218          * a page we just found. Otherwise the hardware can get access to
219          * a bigger memory chunk than it should.
220          */
221         return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
222 }
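
/*
 * Worked example for tce_page_is_contained(), assuming 64K system pages
 * (PAGE_SHIFT == 16): for a 16MB TCE page (page_shift == 24) the check only
 * passes when the backing page belongs to a compound page of order >= 8,
 * i.e. a 16MB or larger huge page; otherwise the callers refuse the mapping
 * with -EPERM.
 */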
223
224 static inline bool tce_groups_attached(struct tce_container *container)
225 {
226         return !list_empty(&container->group_list);
227 }
228
229 static long tce_iommu_find_table(struct tce_container *container,
230                 phys_addr_t ioba, struct iommu_table **ptbl)
231 {
232         long i;
233
234         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
235                 struct iommu_table *tbl = container->tables[i];
236
237                 if (tbl) {
238                         unsigned long entry = ioba >> tbl->it_page_shift;
239                         unsigned long start = tbl->it_offset;
240                         unsigned long end = start + tbl->it_size;
241
242                         if ((start <= entry) && (entry < end)) {
243                                 *ptbl = tbl;
244                                 return i;
245                         }
246                 }
247         }
248
249         return -1;
250 }
251
252 static int tce_iommu_find_free_table(struct tce_container *container)
253 {
254         int i;
255
256         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
257                 if (!container->tables[i])
258                         return i;
259         }
260
261         return -ENOSPC;
262 }
263
264 static int tce_iommu_enable(struct tce_container *container)
265 {
266         int ret = 0;
267         unsigned long locked;
268         struct iommu_table_group *table_group;
269         struct tce_iommu_group *tcegrp;
270
271         if (container->enabled)
272                 return -EBUSY;
273
274         /*
275          * When userspace pages are mapped into the IOMMU, they are effectively
276          * locked memory, so, theoretically, we need to update the accounting
277          * of locked pages on each map and unmap.  For powerpc, the map/unmap
278          * paths can be very hot, though, and the accounting would kill
279          * performance, especially since it would be difficult, if not
280          * impossible, to handle the accounting in real mode only.
281          *
282          * To address that, rather than precisely accounting every page, we
283          * instead account for a worst case on locked memory when the iommu is
284          * enabled and disabled.  The worst case upper bound on locked memory
285          * is the size of the whole iommu window, which is usually relatively
286          * small (compared to total memory sizes) on POWER hardware.
287          *
288          * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
289          * that would effectively kill the guest at random points. It is much
290          * better to enforce the limit based on the max that the guest can map.
291          *
292          * Unfortunately, at the moment it counts whole tables, no matter how
293          * much memory the guest has, i.e. for a 4GB guest and 4 IOMMU groups,
294          * each with a 2GB DMA window, 8GB will be counted here. The reason for
295          * this is that we cannot tell here the amount of RAM used by the guest
296          * as this information is only available from KVM and VFIO is
297          * KVM agnostic.
298          *
299          * So we do not allow enabling a container without a group attached
300          * as there is no way to know how much we should increment
301          * the locked_vm counter.
302          */
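
        /*
         * Worked example, assuming 64K system pages: a group with a 2GB
         * default window charges tce32_size >> PAGE_SHIFT = 0x80000000 >> 16
         * = 32768 pages to locked_vm below, regardless of how much the guest
         * actually maps, and the same amount is released again in
         * tce_iommu_disable().
         */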
303         if (!tce_groups_attached(container))
304                 return -ENODEV;
305
306         tcegrp = list_first_entry(&container->group_list,
307                         struct tce_iommu_group, next);
308         table_group = iommu_group_get_iommudata(tcegrp->grp);
309         if (!table_group)
310                 return -ENODEV;
311
312         if (!table_group->tce32_size)
313                 return -EPERM;
314
315         ret = tce_iommu_mm_set(container);
316         if (ret)
317                 return ret;
318
319         locked = table_group->tce32_size >> PAGE_SHIFT;
320         ret = try_increment_locked_vm(container->mm, locked);
321         if (ret)
322                 return ret;
323
324         container->locked_pages = locked;
325
326         container->enabled = true;
327
328         return ret;
329 }
330
331 static void tce_iommu_disable(struct tce_container *container)
332 {
333         if (!container->enabled)
334                 return;
335
336         container->enabled = false;
337
338         BUG_ON(!container->mm);
339         decrement_locked_vm(container->mm, container->locked_pages);
340 }
341
342 static void *tce_iommu_open(unsigned long arg)
343 {
344         struct tce_container *container;
345
346         if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
347                 pr_err("tce_vfio: Wrong IOMMU type\n");
348                 return ERR_PTR(-EINVAL);
349         }
350
351         container = kzalloc(sizeof(*container), GFP_KERNEL);
352         if (!container)
353                 return ERR_PTR(-ENOMEM);
354
355         mutex_init(&container->lock);
356         INIT_LIST_HEAD_RCU(&container->group_list);
357         INIT_LIST_HEAD_RCU(&container->prereg_list);
358
359         container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
360
361         return container;
362 }
363
364 static int tce_iommu_clear(struct tce_container *container,
365                 struct iommu_table *tbl,
366                 unsigned long entry, unsigned long pages);
367 static void tce_iommu_free_table(struct tce_container *container,
368                 struct iommu_table *tbl);
369
370 static void tce_iommu_release(void *iommu_data)
371 {
372         struct tce_container *container = iommu_data;
373         struct tce_iommu_group *tcegrp;
374         long i;
375
376         while (tce_groups_attached(container)) {
377                 tcegrp = list_first_entry(&container->group_list,
378                                 struct tce_iommu_group, next);
379                 tce_iommu_detach_group(iommu_data, tcegrp->grp);
380         }
381
382         /*
383          * If VFIO created a table, it was not disposed of
384          * by tce_iommu_detach_group(), so do it now.
385          */
386         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
387                 struct iommu_table *tbl = container->tables[i];
388
389                 if (!tbl)
390                         continue;
391
392                 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
393                 tce_iommu_free_table(container, tbl);
394         }
395
396         while (!list_empty(&container->prereg_list)) {
397                 struct tce_iommu_prereg *tcemem;
398
399                 tcemem = list_first_entry(&container->prereg_list,
400                                 struct tce_iommu_prereg, next);
401                 WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
402         }
403
404         tce_iommu_disable(container);
405         if (container->mm)
406                 mmdrop(container->mm);
407         mutex_destroy(&container->lock);
408
409         kfree(container);
410 }
411
412 static void tce_iommu_unuse_page(struct tce_container *container,
413                 unsigned long hpa)
414 {
415         struct page *page;
416
417         page = pfn_to_page(hpa >> PAGE_SHIFT);
418         put_page(page);
419 }
420
421 static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
422                 unsigned long tce, unsigned long shift,
423                 unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
424 {
425         long ret = 0;
426         struct mm_iommu_table_group_mem_t *mem;
427
428         mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
429         if (!mem)
430                 return -EINVAL;
431
432         ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
433         if (ret)
434                 return -EINVAL;
435
436         *pmem = mem;
437
438         return 0;
439 }
440
441 static void tce_iommu_unuse_page_v2(struct tce_container *container,
442                 struct iommu_table *tbl, unsigned long entry)
443 {
444         struct mm_iommu_table_group_mem_t *mem = NULL;
445         int ret;
446         unsigned long hpa = 0;
447         __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
448
449         if (!pua)
450                 return;
451
452         ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
453                         tbl->it_page_shift, &hpa, &mem);
454         if (ret)
455                 pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
456                                 __func__, be64_to_cpu(*pua), entry, ret);
457         if (mem)
458                 mm_iommu_mapped_dec(mem);
459
460         *pua = cpu_to_be64(0);
461 }
462
463 static int tce_iommu_clear(struct tce_container *container,
464                 struct iommu_table *tbl,
465                 unsigned long entry, unsigned long pages)
466 {
467         unsigned long oldhpa;
468         long ret;
469         enum dma_data_direction direction;
470
471         for ( ; pages; --pages, ++entry) {
472                 cond_resched();
473
474                 direction = DMA_NONE;
475                 oldhpa = 0;
476                 ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
477                 if (ret)
478                         continue;
479
480                 if (direction == DMA_NONE)
481                         continue;
482
483                 if (container->v2) {
484                         tce_iommu_unuse_page_v2(container, tbl, entry);
485                         continue;
486                 }
487
488                 tce_iommu_unuse_page(container, oldhpa);
489         }
490
491         return 0;
492 }
493
494 static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
495 {
496         struct page *page = NULL;
497         enum dma_data_direction direction = iommu_tce_direction(tce);
498
499         if (get_user_pages_fast(tce & PAGE_MASK, 1,
500                         direction != DMA_TO_DEVICE, &page) != 1)
501                 return -EFAULT;
502
503         *hpa = __pa((unsigned long) page_address(page));
504
505         return 0;
506 }
507
508 static long tce_iommu_build(struct tce_container *container,
509                 struct iommu_table *tbl,
510                 unsigned long entry, unsigned long tce, unsigned long pages,
511                 enum dma_data_direction direction)
512 {
513         long i, ret = 0;
514         struct page *page;
515         unsigned long hpa;
516         enum dma_data_direction dirtmp;
517
518         for (i = 0; i < pages; ++i) {
519                 unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
520
521                 ret = tce_iommu_use_page(tce, &hpa);
522                 if (ret)
523                         break;
524
525                 page = pfn_to_page(hpa >> PAGE_SHIFT);
526                 if (!tce_page_is_contained(page, tbl->it_page_shift)) {
527                         ret = -EPERM;
528                         break;
529                 }
530
531                 hpa |= offset;
532                 dirtmp = direction;
533                 ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
534                 if (ret) {
535                         tce_iommu_unuse_page(container, hpa);
536                         pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
537                                         __func__, entry << tbl->it_page_shift,
538                                         tce, ret);
539                         break;
540                 }
541
542                 if (dirtmp != DMA_NONE)
543                         tce_iommu_unuse_page(container, hpa);
544
545                 tce += IOMMU_PAGE_SIZE(tbl);
546         }
547
548         if (ret)
549                 tce_iommu_clear(container, tbl, entry, i);
550
551         return ret;
552 }
553
554 static long tce_iommu_build_v2(struct tce_container *container,
555                 struct iommu_table *tbl,
556                 unsigned long entry, unsigned long tce, unsigned long pages,
557                 enum dma_data_direction direction)
558 {
559         long i, ret = 0;
560         struct page *page;
561         unsigned long hpa;
562         enum dma_data_direction dirtmp;
563
564         for (i = 0; i < pages; ++i) {
565                 struct mm_iommu_table_group_mem_t *mem = NULL;
566                 __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
567
568                 ret = tce_iommu_prereg_ua_to_hpa(container,
569                                 tce, tbl->it_page_shift, &hpa, &mem);
570                 if (ret)
571                         break;
572
573                 page = pfn_to_page(hpa >> PAGE_SHIFT);
574                 if (!tce_page_is_contained(page, tbl->it_page_shift)) {
575                         ret = -EPERM;
576                         break;
577                 }
578
579                 /* Preserve offset within IOMMU page */
580                 hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
581                 dirtmp = direction;
582
583                 /* The registered region is being unregistered */
584                 if (mm_iommu_mapped_inc(mem))
585                         break;
586
587                 ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
588                 if (ret) {
589                         /* dirtmp cannot be DMA_NONE here */
590                         tce_iommu_unuse_page_v2(container, tbl, entry + i);
591                         pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
592                                         __func__, entry << tbl->it_page_shift,
593                                         tce, ret);
594                         break;
595                 }
596
597                 if (dirtmp != DMA_NONE)
598                         tce_iommu_unuse_page_v2(container, tbl, entry + i);
599
600                 *pua = cpu_to_be64(tce);
601
602                 tce += IOMMU_PAGE_SIZE(tbl);
603         }
604
605         if (ret)
606                 tce_iommu_clear(container, tbl, entry, i);
607
608         return ret;
609 }
610
611 static long tce_iommu_create_table(struct tce_container *container,
612                         struct iommu_table_group *table_group,
613                         int num,
614                         __u32 page_shift,
615                         __u64 window_size,
616                         __u32 levels,
617                         struct iommu_table **ptbl)
618 {
619         long ret, table_size;
620
621         table_size = table_group->ops->get_table_size(page_shift, window_size,
622                         levels);
623         if (!table_size)
624                 return -EINVAL;
625
626         ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
627         if (ret)
628                 return ret;
629
630         ret = table_group->ops->create_table(table_group, num,
631                         page_shift, window_size, levels, ptbl);
632
633         WARN_ON(!ret && !(*ptbl)->it_ops->free);
634         WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
635
636         return ret;
637 }
638
639 static void tce_iommu_free_table(struct tce_container *container,
640                 struct iommu_table *tbl)
641 {
642         unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
643
644         iommu_tce_table_put(tbl);
645         decrement_locked_vm(container->mm, pages);
646 }
647
648 static long tce_iommu_create_window(struct tce_container *container,
649                 __u32 page_shift, __u64 window_size, __u32 levels,
650                 __u64 *start_addr)
651 {
652         struct tce_iommu_group *tcegrp;
653         struct iommu_table_group *table_group;
654         struct iommu_table *tbl = NULL;
655         long ret, num;
656
657         num = tce_iommu_find_free_table(container);
658         if (num < 0)
659                 return num;
660
661         /* Get the first group for ops::create_table */
662         tcegrp = list_first_entry(&container->group_list,
663                         struct tce_iommu_group, next);
664         table_group = iommu_group_get_iommudata(tcegrp->grp);
665         if (!table_group)
666                 return -EFAULT;
667
668         if (!(table_group->pgsizes & (1ULL << page_shift)))
669                 return -EINVAL;
670
671         if (!table_group->ops->set_window || !table_group->ops->unset_window ||
672                         !table_group->ops->get_table_size ||
673                         !table_group->ops->create_table)
674                 return -EPERM;
675
676         /* Create TCE table */
677         ret = tce_iommu_create_table(container, table_group, num,
678                         page_shift, window_size, levels, &tbl);
679         if (ret)
680                 return ret;
681
682         BUG_ON(!tbl->it_ops->free);
683
684         /*
685          * Program the table to every group.
686          * Groups have been tested for compatibility at the attach time.
687          */
688         list_for_each_entry(tcegrp, &container->group_list, next) {
689                 table_group = iommu_group_get_iommudata(tcegrp->grp);
690
691                 ret = table_group->ops->set_window(table_group, num, tbl);
692                 if (ret)
693                         goto unset_exit;
694         }
695
696         container->tables[num] = tbl;
697
698         /* Return start address assigned by platform in create_table() */
699         *start_addr = tbl->it_offset << tbl->it_page_shift;
700
701         return 0;
702
703 unset_exit:
704         list_for_each_entry(tcegrp, &container->group_list, next) {
705                 table_group = iommu_group_get_iommudata(tcegrp->grp);
706                 table_group->ops->unset_window(table_group, num);
707         }
708         tce_iommu_free_table(container, tbl);
709
710         return ret;
711 }
712
713 static long tce_iommu_remove_window(struct tce_container *container,
714                 __u64 start_addr)
715 {
716         struct iommu_table_group *table_group = NULL;
717         struct iommu_table *tbl;
718         struct tce_iommu_group *tcegrp;
719         int num;
720
721         num = tce_iommu_find_table(container, start_addr, &tbl);
722         if (num < 0)
723                 return -EINVAL;
724
725         BUG_ON(!tbl->it_size);
726
727         /* Detach groups from IOMMUs */
728         list_for_each_entry(tcegrp, &container->group_list, next) {
729                 table_group = iommu_group_get_iommudata(tcegrp->grp);
730
731                 /*
732                  * SPAPR TCE IOMMU exposes the default DMA window to the guest
733                  * via dma32_window_start/size of VFIO_IOMMU_SPAPR_TCE_GET_INFO.
734                  * Some platforms allow userspace to remove this window, some do
735                  * not, so here we check for the platform capability (window
736                  * create/remove is sketched after this function).
737                  */
738                 if (!table_group->ops || !table_group->ops->unset_window)
739                         return -EPERM;
740
741                 table_group->ops->unset_window(table_group, num);
742         }
743
744         /* Free table */
745         tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
746         tce_iommu_free_table(container, tbl);
747         container->tables[num] = NULL;
748
749         return 0;
750 }
751
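/*
 * Usage sketch for dynamic DMA windows on a v2 container (illustrative only;
 * the page shift, window size and level count are assumptions and must match
 * what VFIO_IOMMU_SPAPR_TCE_GET_INFO reports in info.ddw; error handling
 * omitted):
 *
 *      struct vfio_iommu_spapr_tce_create create = {
 *              .argsz = sizeof(create),
 *              .page_shift = 16,               // 64K IOMMU pages
 *              .window_size = 1ULL << 30,      // 1GB window
 *              .levels = 1,
 *      };
 *      ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *      // create.start_addr now holds the bus address of the new window
 *
 *      struct vfio_iommu_spapr_tce_remove remove = {
 *              .argsz = sizeof(remove),
 *              .start_addr = create.start_addr,
 *      };
 *      ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 */
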
752 static long tce_iommu_create_default_window(struct tce_container *container)
753 {
754         long ret;
755         __u64 start_addr = 0;
756         struct tce_iommu_group *tcegrp;
757         struct iommu_table_group *table_group;
758
759         if (!container->def_window_pending)
760                 return 0;
761
762         if (!tce_groups_attached(container))
763                 return -ENODEV;
764
765         tcegrp = list_first_entry(&container->group_list,
766                         struct tce_iommu_group, next);
767         table_group = iommu_group_get_iommudata(tcegrp->grp);
768         if (!table_group)
769                 return -ENODEV;
770
771         ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
772                         table_group->tce32_size, 1, &start_addr);
773         WARN_ON_ONCE(!ret && start_addr);
774
775         if (!ret)
776                 container->def_window_pending = false;
777
778         return ret;
779 }
780
781 static long tce_iommu_ioctl(void *iommu_data,
782                                  unsigned int cmd, unsigned long arg)
783 {
784         struct tce_container *container = iommu_data;
785         unsigned long minsz, ddwsz;
786         long ret;
787
788         switch (cmd) {
789         case VFIO_CHECK_EXTENSION:
790                 switch (arg) {
791                 case VFIO_SPAPR_TCE_IOMMU:
792                 case VFIO_SPAPR_TCE_v2_IOMMU:
793                         ret = 1;
794                         break;
795                 default:
796                         ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
797                         break;
798                 }
799
800                 return (ret < 0) ? 0 : ret;
801         }
802
803         /*
804          * Sanity check to prevent one userspace from manipulating
805          * another userspace's mm.
806          */
807         BUG_ON(!container);
808         if (container->mm && container->mm != current->mm)
809                 return -EPERM;
810
811         switch (cmd) {
812         case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
813                 struct vfio_iommu_spapr_tce_info info;
814                 struct tce_iommu_group *tcegrp;
815                 struct iommu_table_group *table_group;
816
817                 if (!tce_groups_attached(container))
818                         return -ENXIO;
819
820                 tcegrp = list_first_entry(&container->group_list,
821                                 struct tce_iommu_group, next);
822                 table_group = iommu_group_get_iommudata(tcegrp->grp);
823
824                 if (!table_group)
825                         return -ENXIO;
826
827                 minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
828                                 dma32_window_size);
829
830                 if (copy_from_user(&info, (void __user *)arg, minsz))
831                         return -EFAULT;
832
833                 if (info.argsz < minsz)
834                         return -EINVAL;
835
836                 info.dma32_window_start = table_group->tce32_start;
837                 info.dma32_window_size = table_group->tce32_size;
838                 info.flags = 0;
839                 memset(&info.ddw, 0, sizeof(info.ddw));
840
841                 if (table_group->max_dynamic_windows_supported &&
842                                 container->v2) {
843                         info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
844                         info.ddw.pgsizes = table_group->pgsizes;
845                         info.ddw.max_dynamic_windows_supported =
846                                 table_group->max_dynamic_windows_supported;
847                         info.ddw.levels = table_group->max_levels;
848                 }
849
850                 ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
851
852                 if (info.argsz >= ddwsz)
853                         minsz = ddwsz;
854
855                 if (copy_to_user((void __user *)arg, &info, minsz))
856                         return -EFAULT;
857
858                 return 0;
859         }
860         case VFIO_IOMMU_MAP_DMA: {
861                 struct vfio_iommu_type1_dma_map param;
862                 struct iommu_table *tbl = NULL;
863                 long num;
864                 enum dma_data_direction direction;
865
866                 if (!container->enabled)
867                         return -EPERM;
868
869                 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
870
871                 if (copy_from_user(&param, (void __user *)arg, minsz))
872                         return -EFAULT;
873
874                 if (param.argsz < minsz)
875                         return -EINVAL;
876
877                 if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
878                                 VFIO_DMA_MAP_FLAG_WRITE))
879                         return -EINVAL;
880
881                 ret = tce_iommu_create_default_window(container);
882                 if (ret)
883                         return ret;
884
885                 num = tce_iommu_find_table(container, param.iova, &tbl);
886                 if (num < 0)
887                         return -ENXIO;
888
889                 if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
890                                 (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
891                         return -EINVAL;
892
893                 /* iova is checked by the IOMMU API */
894                 if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
895                         if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
896                                 direction = DMA_BIDIRECTIONAL;
897                         else
898                                 direction = DMA_TO_DEVICE;
899                 } else {
900                         if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
901                                 direction = DMA_FROM_DEVICE;
902                         else
903                                 return -EINVAL;
904                 }
905
906                 ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
907                 if (ret)
908                         return ret;
909
910                 if (container->v2)
911                         ret = tce_iommu_build_v2(container, tbl,
912                                         param.iova >> tbl->it_page_shift,
913                                         param.vaddr,
914                                         param.size >> tbl->it_page_shift,
915                                         direction);
916                 else
917                         ret = tce_iommu_build(container, tbl,
918                                         param.iova >> tbl->it_page_shift,
919                                         param.vaddr,
920                                         param.size >> tbl->it_page_shift,
921                                         direction);
922
923                 iommu_flush_tce(tbl);
924
925                 return ret;
926         }
927         case VFIO_IOMMU_UNMAP_DMA: {
928                 struct vfio_iommu_type1_dma_unmap param;
929                 struct iommu_table *tbl = NULL;
930                 long num;
931
932                 if (!container->enabled)
933                         return -EPERM;
934
935                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
936                                 size);
937
938                 if (copy_from_user(&param, (void __user *)arg, minsz))
939                         return -EFAULT;
940
941                 if (param.argsz < minsz)
942                         return -EINVAL;
943
944                 /* No flag is supported now */
945                 if (param.flags)
946                         return -EINVAL;
947
948                 ret = tce_iommu_create_default_window(container);
949                 if (ret)
950                         return ret;
951
952                 num = tce_iommu_find_table(container, param.iova, &tbl);
953                 if (num < 0)
954                         return -ENXIO;
955
956                 if (param.size & ~IOMMU_PAGE_MASK(tbl))
957                         return -EINVAL;
958
959                 ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
960                                 param.size >> tbl->it_page_shift);
961                 if (ret)
962                         return ret;
963
964                 ret = tce_iommu_clear(container, tbl,
965                                 param.iova >> tbl->it_page_shift,
966                                 param.size >> tbl->it_page_shift);
967                 iommu_flush_tce(tbl);
968
969                 return ret;
970         }
971         case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
972                 struct vfio_iommu_spapr_register_memory param;
973
974                 if (!container->v2)
975                         break;
976
977                 minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
978                                 size);
979
980                 ret = tce_iommu_mm_set(container);
981                 if (ret)
982                         return ret;
983
984                 if (copy_from_user(&param, (void __user *)arg, minsz))
985                         return -EFAULT;
986
987                 if (param.argsz < minsz)
988                         return -EINVAL;
989
990                 /* No flag is supported now */
991                 if (param.flags)
992                         return -EINVAL;
993
994                 mutex_lock(&container->lock);
995                 ret = tce_iommu_register_pages(container, param.vaddr,
996                                 param.size);
997                 mutex_unlock(&container->lock);
998
999                 return ret;
1000         }
1001         case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1002                 struct vfio_iommu_spapr_register_memory param;
1003
1004                 if (!container->v2)
1005                         break;
1006
1007                 if (!container->mm)
1008                         return -EPERM;
1009
1010                 minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1011                                 size);
1012
1013                 if (copy_from_user(&param, (void __user *)arg, minsz))
1014                         return -EFAULT;
1015
1016                 if (param.argsz < minsz)
1017                         return -EINVAL;
1018
1019                 /* No flag is supported now */
1020                 if (param.flags)
1021                         return -EINVAL;
1022
1023                 mutex_lock(&container->lock);
1024                 ret = tce_iommu_unregister_pages(container, param.vaddr,
1025                                 param.size);
1026                 mutex_unlock(&container->lock);
1027
1028                 return ret;
1029         }
1030         case VFIO_IOMMU_ENABLE:
1031                 if (container->v2)
1032                         break;
1033
1034                 mutex_lock(&container->lock);
1035                 ret = tce_iommu_enable(container);
1036                 mutex_unlock(&container->lock);
1037                 return ret;
1038
1039
1040         case VFIO_IOMMU_DISABLE:
1041                 if (container->v2)
1042                         break;
1043
1044                 mutex_lock(&container->lock);
1045                 tce_iommu_disable(container);
1046                 mutex_unlock(&container->lock);
1047                 return 0;
1048
1049         case VFIO_EEH_PE_OP: {
1050                 struct tce_iommu_group *tcegrp;
1051
1052                 ret = 0;
1053                 list_for_each_entry(tcegrp, &container->group_list, next) {
1054                         ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1055                                         cmd, arg);
1056                         if (ret)
1057                                 return ret;
1058                 }
1059                 return ret;
1060         }
1061
1062         case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1063                 struct vfio_iommu_spapr_tce_create create;
1064
1065                 if (!container->v2)
1066                         break;
1067
1068                 ret = tce_iommu_mm_set(container);
1069                 if (ret)
1070                         return ret;
1071
1072                 if (!tce_groups_attached(container))
1073                         return -ENXIO;
1074
1075                 minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1076                                 start_addr);
1077
1078                 if (copy_from_user(&create, (void __user *)arg, minsz))
1079                         return -EFAULT;
1080
1081                 if (create.argsz < minsz)
1082                         return -EINVAL;
1083
1084                 if (create.flags)
1085                         return -EINVAL;
1086
1087                 mutex_lock(&container->lock);
1088
1089                 ret = tce_iommu_create_default_window(container);
1090                 if (!ret)
1091                         ret = tce_iommu_create_window(container,
1092                                         create.page_shift,
1093                                         create.window_size, create.levels,
1094                                         &create.start_addr);
1095
1096                 mutex_unlock(&container->lock);
1097
1098                 if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1099                         ret = -EFAULT;
1100
1101                 return ret;
1102         }
1103         case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1104                 struct vfio_iommu_spapr_tce_remove remove;
1105
1106                 if (!container->v2)
1107                         break;
1108
1109                 ret = tce_iommu_mm_set(container);
1110                 if (ret)
1111                         return ret;
1112
1113                 if (!tce_groups_attached(container))
1114                         return -ENXIO;
1115
1116                 minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1117                                 start_addr);
1118
1119                 if (copy_from_user(&remove, (void __user *)arg, minsz))
1120                         return -EFAULT;
1121
1122                 if (remove.argsz < minsz)
1123                         return -EINVAL;
1124
1125                 if (remove.flags)
1126                         return -EINVAL;
1127
1128                 if (container->def_window_pending && !remove.start_addr) {
1129                         container->def_window_pending = false;
1130                         return 0;
1131                 }
1132
1133                 mutex_lock(&container->lock);
1134
1135                 ret = tce_iommu_remove_window(container, remove.start_addr);
1136
1137                 mutex_unlock(&container->lock);
1138
1139                 return ret;
1140         }
1141         }
1142
1143         return -ENOTTY;
1144 }
1145
1146 static void tce_iommu_release_ownership(struct tce_container *container,
1147                 struct iommu_table_group *table_group)
1148 {
1149         int i;
1150
1151         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1152                 struct iommu_table *tbl = container->tables[i];
1153
1154                 if (!tbl)
1155                         continue;
1156
1157                 tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1158                 if (tbl->it_map)
1159                         iommu_release_ownership(tbl);
1160
1161                 container->tables[i] = NULL;
1162         }
1163 }
1164
1165 static int tce_iommu_take_ownership(struct tce_container *container,
1166                 struct iommu_table_group *table_group)
1167 {
1168         int i, j, rc = 0;
1169
1170         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1171                 struct iommu_table *tbl = table_group->tables[i];
1172
1173                 if (!tbl || !tbl->it_map)
1174                         continue;
1175
1176                 rc = iommu_take_ownership(tbl);
1177                 if (rc) {
1178                         for (j = 0; j < i; ++j)
1179                                 iommu_release_ownership(
1180                                                 table_group->tables[j]);
1181
1182                         return rc;
1183                 }
1184         }
1185
1186         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1187                 container->tables[i] = table_group->tables[i];
1188
1189         return 0;
1190 }
1191
1192 static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1193                 struct iommu_table_group *table_group)
1194 {
1195         long i;
1196
1197         if (!table_group->ops->unset_window) {
1198                 WARN_ON_ONCE(1);
1199                 return;
1200         }
1201
1202         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1203                 table_group->ops->unset_window(table_group, i);
1204
1205         table_group->ops->release_ownership(table_group);
1206 }
1207
1208 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1209                 struct iommu_table_group *table_group)
1210 {
1211         long i, ret = 0;
1212
1213         if (!table_group->ops->create_table || !table_group->ops->set_window ||
1214                         !table_group->ops->release_ownership) {
1215                 WARN_ON_ONCE(1);
1216                 return -EFAULT;
1217         }
1218
1219         table_group->ops->take_ownership(table_group);
1220
1221         /* Set all windows to the new group */
1222         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1223                 struct iommu_table *tbl = container->tables[i];
1224
1225                 if (!tbl)
1226                         continue;
1227
1228                 ret = table_group->ops->set_window(table_group, i, tbl);
1229                 if (ret)
1230                         goto release_exit;
1231         }
1232
1233         return 0;
1234
1235 release_exit:
1236         for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1237                 table_group->ops->unset_window(table_group, i);
1238
1239         table_group->ops->release_ownership(table_group);
1240
1241         return ret;
1242 }
1243
1244 static int tce_iommu_attach_group(void *iommu_data,
1245                 struct iommu_group *iommu_group)
1246 {
1247         int ret;
1248         struct tce_container *container = iommu_data;
1249         struct iommu_table_group *table_group;
1250         struct tce_iommu_group *tcegrp = NULL;
1251
1252         mutex_lock(&container->lock);
1253
1254         /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1255                         iommu_group_id(iommu_group), iommu_group); */
1256         table_group = iommu_group_get_iommudata(iommu_group);
1257         if (!table_group) {
1258                 ret = -ENODEV;
1259                 goto unlock_exit;
1260         }
1261
1262         if (tce_groups_attached(container) && (!table_group->ops ||
1263                         !table_group->ops->take_ownership ||
1264                         !table_group->ops->release_ownership)) {
1265                 ret = -EBUSY;
1266                 goto unlock_exit;
1267         }
1268
1269         /* Check if new group has the same iommu_ops (i.e. compatible) */
1270         list_for_each_entry(tcegrp, &container->group_list, next) {
1271                 struct iommu_table_group *table_group_tmp;
1272
1273                 if (tcegrp->grp == iommu_group) {
1274                         pr_warn("tce_vfio: Group %d is already attached\n",
1275                                         iommu_group_id(iommu_group));
1276                         ret = -EBUSY;
1277                         goto unlock_exit;
1278                 }
1279                 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1280                 if (table_group_tmp->ops->create_table !=
1281                                 table_group->ops->create_table) {
1282                         pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1283                                         iommu_group_id(iommu_group),
1284                                         iommu_group_id(tcegrp->grp));
1285                         ret = -EPERM;
1286                         goto unlock_exit;
1287                 }
1288         }
1289
1290         tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1291         if (!tcegrp) {
1292                 ret = -ENOMEM;
1293                 goto unlock_exit;
1294         }
1295
1296         if (!table_group->ops || !table_group->ops->take_ownership ||
1297                         !table_group->ops->release_ownership) {
1298                 if (container->v2) {
1299                         ret = -EPERM;
1300                         goto unlock_exit;
1301                 }
1302                 ret = tce_iommu_take_ownership(container, table_group);
1303         } else {
1304                 if (!container->v2) {
1305                         ret = -EPERM;
1306                         goto unlock_exit;
1307                 }
1308                 ret = tce_iommu_take_ownership_ddw(container, table_group);
1309                 if (!tce_groups_attached(container) && !container->tables[0])
1310                         container->def_window_pending = true;
1311         }
1312
1313         if (!ret) {
1314                 tcegrp->grp = iommu_group;
1315                 list_add(&tcegrp->next, &container->group_list);
1316         }
1317
1318 unlock_exit:
1319         if (ret && tcegrp)
1320                 kfree(tcegrp);
1321
1322         mutex_unlock(&container->lock);
1323
1324         return ret;
1325 }
1326
1327 static void tce_iommu_detach_group(void *iommu_data,
1328                 struct iommu_group *iommu_group)
1329 {
1330         struct tce_container *container = iommu_data;
1331         struct iommu_table_group *table_group;
1332         bool found = false;
1333         struct tce_iommu_group *tcegrp;
1334
1335         mutex_lock(&container->lock);
1336
1337         list_for_each_entry(tcegrp, &container->group_list, next) {
1338                 if (tcegrp->grp == iommu_group) {
1339                         found = true;
1340                         break;
1341                 }
1342         }
1343
1344         if (!found) {
1345                 pr_warn("tce_vfio: detaching unattached group #%u\n",
1346                                 iommu_group_id(iommu_group));
1347                 goto unlock_exit;
1348         }
1349
1350         list_del(&tcegrp->next);
1351         kfree(tcegrp);
1352
1353         table_group = iommu_group_get_iommudata(iommu_group);
1354         BUG_ON(!table_group);
1355
1356         if (!table_group->ops || !table_group->ops->release_ownership)
1357                 tce_iommu_release_ownership(container, table_group);
1358         else
1359                 tce_iommu_release_ownership_ddw(container, table_group);
1360
1361 unlock_exit:
1362         mutex_unlock(&container->lock);
1363 }
1364
1365 const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1366         .name           = "iommu-vfio-powerpc",
1367         .owner          = THIS_MODULE,
1368         .open           = tce_iommu_open,
1369         .release        = tce_iommu_release,
1370         .ioctl          = tce_iommu_ioctl,
1371         .attach_group   = tce_iommu_attach_group,
1372         .detach_group   = tce_iommu_detach_group,
1373 };
1374
1375 static int __init tce_iommu_init(void)
1376 {
1377         return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1378 }
1379
1380 static void __exit tce_iommu_cleanup(void)
1381 {
1382         vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1383 }
1384
1385 module_init(tce_iommu_init);
1386 module_exit(tce_iommu_cleanup);
1387
1388 MODULE_VERSION(DRIVER_VERSION);
1389 MODULE_LICENSE("GPL v2");
1390 MODULE_AUTHOR(DRIVER_AUTHOR);
1391 MODULE_DESCRIPTION(DRIVER_DESC);
1392