drivers/iommu/intel/svm.c [linux-2.6-microblaze.git]
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2015 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>
6  */
7
8 #include <linux/intel-iommu.h>
9 #include <linux/mmu_notifier.h>
10 #include <linux/sched.h>
11 #include <linux/sched/mm.h>
12 #include <linux/slab.h>
13 #include <linux/intel-svm.h>
14 #include <linux/rculist.h>
15 #include <linux/pci.h>
16 #include <linux/pci-ats.h>
17 #include <linux/dmar.h>
18 #include <linux/interrupt.h>
19 #include <linux/mm_types.h>
20 #include <linux/ioasid.h>
21 #include <asm/page.h>
22 #include <asm/fpu/api.h>
23
24 #include "pasid.h"
25
26 static irqreturn_t prq_event_thread(int irq, void *d);
27 static void intel_svm_drain_prq(struct device *dev, u32 pasid);
28
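/*
 * Allocation order of the page request queue. With PRQ_ORDER 0 the queue
 * is a single 4KiB page holding 128 page request descriptors of 32 bytes
 * each (see PRQ_RING_MASK and struct page_req_dsc below).
 */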
29 #define PRQ_ORDER 0
30
31 int intel_svm_enable_prq(struct intel_iommu *iommu)
32 {
33         struct page *pages;
34         int irq, ret;
35
36         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
37         if (!pages) {
38                 pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
39                         iommu->name);
40                 return -ENOMEM;
41         }
42         iommu->prq = page_address(pages);
43
44         irq = dmar_alloc_hwirq(DMAR_UNITS_SUPPORTED + iommu->seq_id, iommu->node, iommu);
45         if (irq <= 0) {
46                 pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n",
47                        iommu->name);
48                 ret = -EINVAL;
49         err:
50                 free_pages((unsigned long)iommu->prq, PRQ_ORDER);
51                 iommu->prq = NULL;
52                 return ret;
53         }
54         iommu->pr_irq = irq;
55
56         snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id);
57
58         ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT,
59                                    iommu->prq_name, iommu);
60         if (ret) {
61                 pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n",
62                        iommu->name);
63                 dmar_free_hwirq(irq);
64                 iommu->pr_irq = 0;
65                 goto err;
66         }
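        /*
         * Program the page request queue: reset the head and tail pointers,
         * then point PQA at the queue's physical base address with the size
         * order encoded in the low bits.
         */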
67         dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
68         dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
69         dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER);
70
71         init_completion(&iommu->prq_complete);
72
73         return 0;
74 }
75
76 int intel_svm_finish_prq(struct intel_iommu *iommu)
77 {
78         dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
79         dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
80         dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
81
82         if (iommu->pr_irq) {
83                 free_irq(iommu->pr_irq, iommu);
84                 dmar_free_hwirq(iommu->pr_irq);
85                 iommu->pr_irq = 0;
86         }
87
88         free_pages((unsigned long)iommu->prq, PRQ_ORDER);
89         iommu->prq = NULL;
90
91         return 0;
92 }
93
94 static inline bool intel_svm_capable(struct intel_iommu *iommu)
95 {
96         return iommu->flags & VTD_FLAG_SVM_CAPABLE;
97 }
98
99 void intel_svm_check(struct intel_iommu *iommu)
100 {
101         if (!pasid_supported(iommu))
102                 return;
103
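        /*
         * SVM shares the CPU's first-level page tables, so the IOMMU must
         * support whatever the CPU may place in them: 1GiB pages and, with
         * LA57, 5-level paging.
         */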
104         if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
105             !cap_fl1gp_support(iommu->cap)) {
106                 pr_err("%s SVM disabled, incompatible 1GB page capability\n",
107                        iommu->name);
108                 return;
109         }
110
111         if (cpu_feature_enabled(X86_FEATURE_LA57) &&
112             !cap_5lp_support(iommu->cap)) {
113                 pr_err("%s SVM disabled, incompatible paging mode\n",
114                        iommu->name);
115                 return;
116         }
117
118         iommu->flags |= VTD_FLAG_SVM_CAPABLE;
119 }
120
121 static void intel_flush_svm_range_dev(struct intel_svm *svm, struct intel_svm_dev *sdev,
122                                 unsigned long address, unsigned long pages, int ih)
123 {
124         struct qi_desc desc;
125
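        /*
         * pages == -1 requests a PASID-selective (non-global) flush of the
         * whole address space; otherwise a page-selective flush sized to the
         * next power of two is built below.
         */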
126         if (pages == -1) {
127                 desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
128                         QI_EIOTLB_DID(sdev->did) |
129                         QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
130                         QI_EIOTLB_TYPE;
131                 desc.qw1 = 0;
132         } else {
133                 int mask = ilog2(__roundup_pow_of_two(pages));
134
135                 desc.qw0 = QI_EIOTLB_PASID(svm->pasid) |
136                                 QI_EIOTLB_DID(sdev->did) |
137                                 QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) |
138                                 QI_EIOTLB_TYPE;
139                 desc.qw1 = QI_EIOTLB_ADDR(address) |
140                                 QI_EIOTLB_IH(ih) |
141                                 QI_EIOTLB_AM(mask);
142         }
143         desc.qw2 = 0;
144         desc.qw3 = 0;
145         qi_submit_sync(svm->iommu, &desc, 1, 0);
146
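        /* If the device TLB (ATS) is in use, invalidate it for the same range. */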
147         if (sdev->dev_iotlb) {
148                 desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) |
149                                 QI_DEV_EIOTLB_SID(sdev->sid) |
150                                 QI_DEV_EIOTLB_QDEP(sdev->qdep) |
151                                 QI_DEIOTLB_TYPE;
152                 if (pages == -1) {
153                         desc.qw1 = QI_DEV_EIOTLB_ADDR(-1ULL >> 1) |
154                                         QI_DEV_EIOTLB_SIZE;
155                 } else if (pages > 1) {
156                         /* The least significant zero bit indicates the size. So,
157                          * for example, an "address" value of 0x12345f000 will
158                          * flush from 0x123440000 to 0x12347ffff (256KiB). */
159                         unsigned long last = address + ((unsigned long)(pages - 1) << VTD_PAGE_SHIFT);
160                         unsigned long mask = __rounddown_pow_of_two(address ^ last);
161
162                         desc.qw1 = QI_DEV_EIOTLB_ADDR((address & ~mask) |
163                                         (mask - 1)) | QI_DEV_EIOTLB_SIZE;
164                 } else {
165                         desc.qw1 = QI_DEV_EIOTLB_ADDR(address);
166                 }
167                 desc.qw2 = 0;
168                 desc.qw3 = 0;
169                 qi_submit_sync(svm->iommu, &desc, 1, 0);
170         }
171 }
172
173 static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address,
174                                 unsigned long pages, int ih)
175 {
176         struct intel_svm_dev *sdev;
177
178         rcu_read_lock();
179         list_for_each_entry_rcu(sdev, &svm->devs, list)
180                 intel_flush_svm_range_dev(svm, sdev, address, pages, ih);
181         rcu_read_unlock();
182 }
183
184 /* Pages have been freed at this point */
185 static void intel_invalidate_range(struct mmu_notifier *mn,
186                                    struct mm_struct *mm,
187                                    unsigned long start, unsigned long end)
188 {
189         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
190
191         intel_flush_svm_range(svm, start,
192                               (end - start + PAGE_SIZE - 1) >> VTD_PAGE_SHIFT, 0);
193 }
194
195 static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
196 {
197         struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
198         struct intel_svm_dev *sdev;
199
200         /* This might end up being called from exit_mmap(), *before* the page
201          * tables are cleared. And __mmu_notifier_release() will delete us from
202          * the list of notifiers so that our invalidate_range() callback doesn't
203          * get called when the page tables are cleared. So we need to protect
204          * against hardware accessing those page tables.
205          *
206          * We do it by clearing the entry in the PASID table and then flushing
207          * the IOTLB and the PASID table caches. This might upset hardware;
208          * perhaps we'll want to point the PASID to a dummy PGD (like the zero
209          * page) so that we end up taking a fault that the hardware really
210          * *has* to handle gracefully without affecting other processes.
211          */
212         rcu_read_lock();
213         list_for_each_entry_rcu(sdev, &svm->devs, list)
214                 intel_pasid_tear_down_entry(svm->iommu, sdev->dev,
215                                             svm->pasid, true);
216         rcu_read_unlock();
217
218 }
219
220 static const struct mmu_notifier_ops intel_mmuops = {
221         .release = intel_mm_release,
222         .invalidate_range = intel_invalidate_range,
223 };
224
225 static DEFINE_MUTEX(pasid_mutex);
226 static LIST_HEAD(global_svm_list);
227
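/*
 * Iterate over the devices bound to @svm but run the loop body only for the
 * entry whose ->dev matches @d; the "{} else" form keeps the macro safe in
 * unbraced if/else contexts.
 */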
228 #define for_each_svm_dev(sdev, svm, d)                  \
229         list_for_each_entry((sdev), &(svm)->devs, list) \
230                 if ((d) != (sdev)->dev) {} else
231
232 static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid,
233                              struct intel_svm **rsvm,
234                              struct intel_svm_dev **rsdev)
235 {
236         struct intel_svm_dev *d, *sdev = NULL;
237         struct intel_svm *svm;
238
239         /* The caller should hold the pasid_mutex lock */
240         if (WARN_ON(!mutex_is_locked(&pasid_mutex)))
241                 return -EINVAL;
242
243         if (pasid == INVALID_IOASID || pasid >= PASID_MAX)
244                 return -EINVAL;
245
246         svm = ioasid_find(NULL, pasid, NULL);
247         if (IS_ERR(svm))
248                 return PTR_ERR(svm);
249
250         if (!svm)
251                 goto out;
252
253         /*
254          * If we found an svm for the PASID, there must be at least one device
255          * bound to it.
256          */
257         if (WARN_ON(list_empty(&svm->devs)))
258                 return -EINVAL;
259
260         rcu_read_lock();
261         list_for_each_entry_rcu(d, &svm->devs, list) {
262                 if (d->dev == dev) {
263                         sdev = d;
264                         break;
265                 }
266         }
267         rcu_read_unlock();
268
269 out:
270         *rsvm = svm;
271         *rsdev = sdev;
272
273         return 0;
274 }
275
276 int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev,
277                           struct iommu_gpasid_bind_data *data)
278 {
279         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
280         struct intel_svm_dev *sdev = NULL;
281         struct dmar_domain *dmar_domain;
282         struct intel_svm *svm = NULL;
283         int ret = 0;
284
285         if (WARN_ON(!iommu) || !data)
286                 return -EINVAL;
287
288         if (data->version != IOMMU_GPASID_BIND_VERSION_1 ||
289             data->format != IOMMU_PASID_FORMAT_INTEL_VTD)
290                 return -EINVAL;
291
292         if (!dev_is_pci(dev))
293                 return -ENOTSUPP;
294
295         /* VT-d only supports devices with the full 20-bit PASID width */
296         if (pci_max_pasids(to_pci_dev(dev)) != PASID_MAX)
297                 return -EINVAL;
298
299         /*
300          * We only check the host PASID range; we have no way to check the
301          * guest PASID range.
302          */
303         if (data->hpasid <= 0 || data->hpasid >= PASID_MAX)
304                 return -EINVAL;
305
306         dmar_domain = to_dmar_domain(domain);
307
308         mutex_lock(&pasid_mutex);
309         ret = pasid_to_svm_sdev(dev, data->hpasid, &svm, &sdev);
310         if (ret)
311                 goto out;
312
313         if (sdev) {
314                 /*
315                  * Do not allow multiple bindings of the same device-PASID since
316                  * there is only one set of SL page tables per PASID. We may revisit
317                  * this once sharing a PGD across domains is supported.
318                  */
319                 dev_warn_ratelimited(dev, "Already bound with PASID %u\n",
320                                      svm->pasid);
321                 ret = -EBUSY;
322                 goto out;
323         }
324
325         if (!svm) {
326                 /* We come here when the PASID has never been bound to a device. */
327                 svm = kzalloc(sizeof(*svm), GFP_KERNEL);
328                 if (!svm) {
329                         ret = -ENOMEM;
330                         goto out;
331                 }
332                 /* REVISIT: the upper layer/VFIO can track the host process that
333                  * binds the PASID. ioasid_set = mm might be sufficient for VFIO to
334                  * check PASID VMM ownership. We can drop the following line
335                  * once the VFIO and IOASID set checks are in place.
336                  */
337                 svm->mm = get_task_mm(current);
338                 svm->pasid = data->hpasid;
339                 if (data->flags & IOMMU_SVA_GPASID_VAL) {
340                         svm->gpasid = data->gpasid;
341                         svm->flags |= SVM_FLAG_GUEST_PASID;
342                 }
343                 ioasid_set_data(data->hpasid, svm);
344                 INIT_LIST_HEAD_RCU(&svm->devs);
345                 mmput(svm->mm);
346         }
347         sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
348         if (!sdev) {
349                 ret = -ENOMEM;
350                 goto out;
351         }
352         sdev->dev = dev;
353
354         /* Only count users if device has aux domains */
355         if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
356                 sdev->users = 1;
357
358         /* Set up device context entry for PASID if not enabled already */
359         ret = intel_iommu_enable_pasid(iommu, sdev->dev);
360         if (ret) {
361                 dev_err_ratelimited(dev, "Failed to enable PASID capability\n");
362                 kfree(sdev);
363                 goto out;
364         }
365
366         /*
367          * The PASID table is per-device for better security. Therefore, for
368          * each bind of a new device, even with an existing PASID, we need to
369          * call the nested mode setup function here.
370          */
371         spin_lock(&iommu->lock);
372         ret = intel_pasid_setup_nested(iommu, dev,
373                                        (pgd_t *)(uintptr_t)data->gpgd,
374                                        data->hpasid, &data->vtd, dmar_domain,
375                                        data->addr_width);
376         spin_unlock(&iommu->lock);
377         if (ret) {
378                 dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n",
379                                     data->hpasid, ret);
380                 /*
381                  * The PASID entry should be in the cleared state if nested mode
382                  * setup failed, so we only need to clear the IOASID tracking
383                  * data so that the free call will succeed.
384                  */
385                 kfree(sdev);
386                 goto out;
387         }
388
389         svm->flags |= SVM_FLAG_GUEST_MODE;
390
391         init_rcu_head(&sdev->rcu);
392         list_add_rcu(&sdev->list, &svm->devs);
393  out:
394         if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) {
395                 ioasid_set_data(data->hpasid, NULL);
396                 kfree(svm);
397         }
398
399         mutex_unlock(&pasid_mutex);
400         return ret;
401 }
402
403 int intel_svm_unbind_gpasid(struct device *dev, u32 pasid)
404 {
405         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
406         struct intel_svm_dev *sdev;
407         struct intel_svm *svm;
408         int ret;
409
410         if (WARN_ON(!iommu))
411                 return -EINVAL;
412
413         mutex_lock(&pasid_mutex);
414         ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
415         if (ret)
416                 goto out;
417
418         if (sdev) {
419                 if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
420                         sdev->users--;
421                 if (!sdev->users) {
422                         list_del_rcu(&sdev->list);
423                         intel_pasid_tear_down_entry(iommu, dev,
424                                                     svm->pasid, false);
425                         intel_svm_drain_prq(dev, svm->pasid);
426                         kfree_rcu(sdev, rcu);
427
428                         if (list_empty(&svm->devs)) {
429                                 /*
430                                  * We do not free the IOASID here because the
431                                  * IOMMU driver did not allocate it.
432                                  * Unlike native SVM, an IOASID for guest use is
433                                  * allocated prior to the bind call.
434                                  * In any case, if the free call comes before
435                                  * the unbind, the IOMMU driver will be notified
436                                  * and perform cleanup.
437                                  */
438                                 ioasid_set_data(pasid, NULL);
439                                 kfree(svm);
440                         }
441                 }
442         }
443 out:
444         mutex_unlock(&pasid_mutex);
445         return ret;
446 }
447
448 static void _load_pasid(void *unused)
449 {
450         update_pasid();
451 }
452
453 static void load_pasid(struct mm_struct *mm, u32 pasid)
454 {
455         mutex_lock(&mm->context.lock);
456
457         /* Synchronize with READ_ONCE in update_pasid(). */
458         smp_store_release(&mm->pasid, pasid);
459
460         /* Update PASID MSR on all CPUs running the mm's tasks. */
461         on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
462
463         mutex_unlock(&mm->context.lock);
464 }
465
466 /* Caller must hold pasid_mutex, mm reference */
467 static int
468 intel_svm_bind_mm(struct device *dev, unsigned int flags,
469                   struct svm_dev_ops *ops,
470                   struct mm_struct *mm, struct intel_svm_dev **sd)
471 {
472         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
473         struct device_domain_info *info;
474         struct intel_svm_dev *sdev;
475         struct intel_svm *svm = NULL;
476         int pasid_max;
477         int ret;
478
479         if (!iommu || dmar_disabled)
480                 return -EINVAL;
481
482         if (!intel_svm_capable(iommu))
483                 return -ENOTSUPP;
484
485         if (dev_is_pci(dev)) {
486                 pasid_max = pci_max_pasids(to_pci_dev(dev));
487                 if (pasid_max < 0)
488                         return -EINVAL;
489         } else
490                 pasid_max = 1 << 20;
491
492         /* Binding a supervisor PASID should have mm == NULL */
493         if (flags & SVM_FLAG_SUPERVISOR_MODE) {
494                 if (!ecap_srs(iommu->ecap) || mm) {
495                         pr_err("Supervisor PASID with user provided mm.\n");
496                         return -EINVAL;
497                 }
498         }
499
500         if (!(flags & SVM_FLAG_PRIVATE_PASID)) {
501                 struct intel_svm *t;
502
503                 list_for_each_entry(t, &global_svm_list, list) {
504                         if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
505                                 continue;
506
507                         svm = t;
508                         if (svm->pasid >= pasid_max) {
509                                 dev_warn(dev,
510                                          "Limited PASID width. Cannot use existing PASID %d\n",
511                                          svm->pasid);
512                                 ret = -ENOSPC;
513                                 goto out;
514                         }
515
516                         /* Find the matching device in svm list */
517                         for_each_svm_dev(sdev, svm, dev) {
518                                 if (sdev->ops != ops) {
519                                         ret = -EBUSY;
520                                         goto out;
521                                 }
522                                 sdev->users++;
523                                 goto success;
524                         }
525
526                         break;
527                 }
528         }
529
530         sdev = kzalloc(sizeof(*sdev), GFP_KERNEL);
531         if (!sdev) {
532                 ret = -ENOMEM;
533                 goto out;
534         }
535         sdev->dev = dev;
536
537         ret = intel_iommu_enable_pasid(iommu, dev);
538         if (ret) {
539                 kfree(sdev);
540                 goto out;
541         }
542
543         info = get_domain_info(dev);
544         sdev->did = FLPT_DEFAULT_DID;
545         sdev->sid = PCI_DEVID(info->bus, info->devfn);
546         if (info->ats_enabled) {
547                 sdev->dev_iotlb = 1;
548                 sdev->qdep = info->ats_qdep;
549                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
550                         sdev->qdep = 0;
551         }
552
553         /* Finish the setup now that we know we're keeping it */
554         sdev->users = 1;
555         sdev->ops = ops;
556         init_rcu_head(&sdev->rcu);
557
558         if (!svm) {
559                 svm = kzalloc(sizeof(*svm), GFP_KERNEL);
560                 if (!svm) {
561                         ret = -ENOMEM;
562                         kfree(sdev);
563                         goto out;
564                 }
565                 svm->iommu = iommu;
566
567                 if (pasid_max > intel_pasid_max_id)
568                         pasid_max = intel_pasid_max_id;
569
570                 /* Do not use PASID 0, reserved for RID to PASID */
571                 svm->pasid = ioasid_alloc(NULL, PASID_MIN,
572                                           pasid_max - 1, svm);
573                 if (svm->pasid == INVALID_IOASID) {
574                         kfree(svm);
575                         kfree(sdev);
576                         ret = -ENOSPC;
577                         goto out;
578                 }
579                 svm->notifier.ops = &intel_mmuops;
580                 svm->mm = mm;
581                 svm->flags = flags;
582                 INIT_LIST_HEAD_RCU(&svm->devs);
583                 INIT_LIST_HEAD(&svm->list);
584                 ret = -ENOMEM;
585                 if (mm) {
586                         ret = mmu_notifier_register(&svm->notifier, mm);
587                         if (ret) {
588                                 ioasid_free(svm->pasid);
589                                 kfree(svm);
590                                 kfree(sdev);
591                                 goto out;
592                         }
593                 }
594
595                 spin_lock(&iommu->lock);
596                 ret = intel_pasid_setup_first_level(iommu, dev,
597                                 mm ? mm->pgd : init_mm.pgd,
598                                 svm->pasid, FLPT_DEFAULT_DID,
599                                 (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
600                                 (cpu_feature_enabled(X86_FEATURE_LA57) ?
601                                  PASID_FLAG_FL5LP : 0));
602                 spin_unlock(&iommu->lock);
603                 if (ret) {
604                         if (mm)
605                                 mmu_notifier_unregister(&svm->notifier, mm);
606                         ioasid_free(svm->pasid);
607                         kfree(svm);
608                         kfree(sdev);
609                         goto out;
610                 }
611
612                 list_add_tail(&svm->list, &global_svm_list);
613                 if (mm) {
614                         /* The newly allocated PASID is loaded into the mm. */
615                         load_pasid(mm, svm->pasid);
616                 }
617         } else {
618                 /*
619                  * Binding a new device with existing PASID, need to setup
620                  * the PASID entry.
621                  */
622                 spin_lock(&iommu->lock);
623                 ret = intel_pasid_setup_first_level(iommu, dev,
624                                                 mm ? mm->pgd : init_mm.pgd,
625                                                 svm->pasid, FLPT_DEFAULT_DID,
626                                                 (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) |
627                                                 (cpu_feature_enabled(X86_FEATURE_LA57) ?
628                                                 PASID_FLAG_FL5LP : 0));
629                 spin_unlock(&iommu->lock);
630                 if (ret) {
631                         kfree(sdev);
632                         goto out;
633                 }
634         }
635         list_add_rcu(&sdev->list, &svm->devs);
636 success:
637         sdev->pasid = svm->pasid;
638         sdev->sva.dev = dev;
639         if (sd)
640                 *sd = sdev;
641         ret = 0;
642 out:
643         return ret;
644 }
645
646 /* Caller must hold pasid_mutex */
647 static int intel_svm_unbind_mm(struct device *dev, u32 pasid)
648 {
649         struct intel_svm_dev *sdev;
650         struct intel_iommu *iommu;
651         struct intel_svm *svm;
652         int ret = -EINVAL;
653
654         iommu = device_to_iommu(dev, NULL, NULL);
655         if (!iommu)
656                 goto out;
657
658         ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev);
659         if (ret)
660                 goto out;
661
662         if (sdev) {
663                 sdev->users--;
664                 if (!sdev->users) {
665                         list_del_rcu(&sdev->list);
666                         /* Flush the PASID cache and IOTLB for this device.
667                          * Note that we do depend on the hardware *not* using
668                          * the PASID any more. Just as we depend on other
669                          * devices never using PASIDs that they have no right
670                          * to use. We have a *shared* PASID table, because it's
671                          * large and has to be physically contiguous. So it's
672                          * hard to be as defensive as we might like. */
673                         intel_pasid_tear_down_entry(iommu, dev,
674                                                     svm->pasid, false);
675                         intel_svm_drain_prq(dev, svm->pasid);
676                         kfree_rcu(sdev, rcu);
677
678                         if (list_empty(&svm->devs)) {
679                                 ioasid_free(svm->pasid);
680                                 if (svm->mm) {
681                                         mmu_notifier_unregister(&svm->notifier, svm->mm);
682                                         /* Clear mm's pasid. */
683                                         load_pasid(svm->mm, PASID_DISABLED);
684                                 }
685                                 list_del(&svm->list);
686                                 /* We mandate that no page faults may be outstanding
687                                  * for the PASID when intel_svm_unbind_mm() is called.
688                                  * If that is not obeyed, subtle errors will happen.
689                                  * Let's make them less subtle... */
690                                 memset(svm, 0x6b, sizeof(*svm));
691                                 kfree(svm);
692                         }
693                 }
694         }
695 out:
696         return ret;
697 }
698
699 /* Page request queue descriptor */
700 struct page_req_dsc {
701         union {
702                 struct {
703                         u64 type:8;
704                         u64 pasid_present:1;
705                         u64 priv_data_present:1;
706                         u64 rsvd:6;
707                         u64 rid:16;
708                         u64 pasid:20;
709                         u64 exe_req:1;
710                         u64 pm_req:1;
711                         u64 rsvd2:10;
712                 };
713                 u64 qw_0;
714         };
715         union {
716                 struct {
717                         u64 rd_req:1;
718                         u64 wr_req:1;
719                         u64 lpig:1;
720                         u64 prg_index:9;
721                         u64 addr:52;
722                 };
723                 u64 qw_1;
724         };
725         u64 priv_data[2];
726 };
727
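/*
 * The queue holds (0x1000 << PRQ_ORDER) bytes of 32-byte descriptors;
 * masking an offset with PRQ_RING_MASK keeps it descriptor-aligned and
 * wraps it at the end of the ring.
 */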
728 #define PRQ_RING_MASK   ((0x1000 << PRQ_ORDER) - 0x20)
729
730 static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
731 {
732         unsigned long requested = 0;
733
734         if (req->exe_req)
735                 requested |= VM_EXEC;
736
737         if (req->rd_req)
738                 requested |= VM_READ;
739
740         if (req->wr_req)
741                 requested |= VM_WRITE;
742
743         return (requested & ~vma->vm_flags) != 0;
744 }
745
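/*
 * An address is canonical when bits 63:__VIRTUAL_MASK_SHIFT are a sign
 * extension of bit __VIRTUAL_MASK_SHIFT; the shift pair below sign-extends
 * the value and checks that it is unchanged.
 */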
746 static bool is_canonical_address(u64 addr)
747 {
748         int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
749         long saddr = (long) addr;
750
751         return (((saddr << shift) >> shift) == saddr);
752 }
753
754 /**
755  * intel_svm_drain_prq - Drain page requests and responses for a pasid
756  * @dev: target device
757  * @pasid: pasid for draining
758  *
759  * Drain all pending page requests and responses related to @pasid in both
760  * software and hardware. This is supposed to be called after the device
761  * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB
762  * and DevTLB have been invalidated.
763  *
764  * It waits until all pending page requests for @pasid in the page fault
765  * queue are completed by the prq handling thread. Then follow the steps
766  * described in VT-d spec CH7.10 to drain all page requests and page
767  * responses pending in the hardware.
768  */
769 static void intel_svm_drain_prq(struct device *dev, u32 pasid)
770 {
771         struct device_domain_info *info;
772         struct dmar_domain *domain;
773         struct intel_iommu *iommu;
774         struct qi_desc desc[3];
775         struct pci_dev *pdev;
776         int head, tail;
777         u16 sid, did;
778         int qdep;
779
780         info = get_domain_info(dev);
781         if (WARN_ON(!info || !dev_is_pci(dev)))
782                 return;
783
784         if (!info->pri_enabled)
785                 return;
786
787         iommu = info->iommu;
788         domain = info->domain;
789         pdev = to_pci_dev(dev);
790         sid = PCI_DEVID(info->bus, info->devfn);
791         did = domain->iommu_did[iommu->seq_id];
792         qdep = pci_ats_queue_depth(pdev);
793
794         /*
795          * Check and wait until all pending page requests in the queue are
796          * handled by the prq handling thread.
797          */
798 prq_retry:
799         reinit_completion(&iommu->prq_complete);
800         tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
801         head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
802         while (head != tail) {
803                 struct page_req_dsc *req;
804
805                 req = &iommu->prq[head / sizeof(*req)];
806                 if (!req->pasid_present || req->pasid != pasid) {
807                         head = (head + sizeof(*req)) & PRQ_RING_MASK;
808                         continue;
809                 }
810
811                 wait_for_completion(&iommu->prq_complete);
812                 goto prq_retry;
813         }
814
815         /*
816          * Perform steps described in VT-d spec CH7.10 to drain page
817          * requests and responses in hardware.
818          */
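        /*
         * desc[0] is an invalidation wait with the fence bit set, desc[1] a
         * PASID-selective IOTLB invalidation, and desc[2] a PASID-based
         * device-TLB invalidation. Submitted with QI_OPT_WAIT_DRAIN, they
         * implement the spec's page request/response drain sequence.
         */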
819         memset(desc, 0, sizeof(desc));
820         desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
821                         QI_IWD_FENCE |
822                         QI_IWD_TYPE;
823         desc[1].qw0 = QI_EIOTLB_PASID(pasid) |
824                         QI_EIOTLB_DID(did) |
825                         QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) |
826                         QI_EIOTLB_TYPE;
827         desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) |
828                         QI_DEV_EIOTLB_SID(sid) |
829                         QI_DEV_EIOTLB_QDEP(qdep) |
830                         QI_DEIOTLB_TYPE |
831                         QI_DEV_IOTLB_PFSID(info->pfsid);
832 qi_retry:
833         reinit_completion(&iommu->prq_complete);
834         qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN);
835         if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) {
836                 wait_for_completion(&iommu->prq_complete);
837                 goto qi_retry;
838         }
839 }
840
841 static int prq_to_iommu_prot(struct page_req_dsc *req)
842 {
843         int prot = 0;
844
845         if (req->rd_req)
846                 prot |= IOMMU_FAULT_PERM_READ;
847         if (req->wr_req)
848                 prot |= IOMMU_FAULT_PERM_WRITE;
849         if (req->exe_req)
850                 prot |= IOMMU_FAULT_PERM_EXEC;
851         if (req->pm_req)
852                 prot |= IOMMU_FAULT_PERM_PRIV;
853
854         return prot;
855 }
856
857 static int
858 intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
859 {
860         struct iommu_fault_event event;
861
862         if (!dev || !dev_is_pci(dev))
863                 return -ENODEV;
864
865         /* Fill in event data for device specific processing */
866         memset(&event, 0, sizeof(struct iommu_fault_event));
867         event.fault.type = IOMMU_FAULT_PAGE_REQ;
868         event.fault.prm.addr = desc->addr;
869         event.fault.prm.pasid = desc->pasid;
870         event.fault.prm.grpid = desc->prg_index;
871         event.fault.prm.perm = prq_to_iommu_prot(desc);
872
873         if (desc->lpig)
874                 event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
875         if (desc->pasid_present) {
876                 event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
877                 event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
878         }
879         if (desc->priv_data_present) {
880                 /*
881                  * Set last page in group bit if private data is present,
882                  * page response is required as it does for LPIG.
883                  * iommu_report_device_fault() doesn't understand this vendor
884                  * specific requirement thus we set last_page as a workaround.
885                  */
886                 event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
887                 event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
888                 memcpy(event.fault.prm.private_data, desc->priv_data,
889                        sizeof(desc->priv_data));
890         }
891
892         return iommu_report_device_fault(dev, &event);
893 }
894
895 static irqreturn_t prq_event_thread(int irq, void *d)
896 {
897         struct intel_svm_dev *sdev = NULL;
898         struct intel_iommu *iommu = d;
899         struct intel_svm *svm = NULL;
900         int head, tail, handled = 0;
901
902         /* Clear PPR bit before reading head/tail registers, to
903          * ensure that we get a new interrupt if needed. */
904         writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG);
905
906         tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK;
907         head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK;
908         while (head != tail) {
909                 struct vm_area_struct *vma;
910                 struct page_req_dsc *req;
911                 struct qi_desc resp;
912                 int result;
913                 vm_fault_t ret;
914                 u64 address;
915
916                 handled = 1;
917
918                 req = &iommu->prq[head / sizeof(*req)];
919
920                 result = QI_RESP_FAILURE;
921                 address = (u64)req->addr << VTD_PAGE_SHIFT;
922                 if (!req->pasid_present) {
923                         pr_err("%s: Page request without PASID: %08llx %08llx\n",
924                                iommu->name, ((unsigned long long *)req)[0],
925                                ((unsigned long long *)req)[1]);
926                         goto no_pasid;
927                 }
928
929                 if (!svm || svm->pasid != req->pasid) {
930                         rcu_read_lock();
931                         svm = ioasid_find(NULL, req->pasid, NULL);
932                         /* It *can't* go away, because the driver is not permitted
933                          * to unbind the mm while any page faults are outstanding.
934                          * So we only need RCU to protect the internal ioasid lookup. */
935                         rcu_read_unlock();
936                         if (IS_ERR_OR_NULL(svm)) {
937                                 pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n",
938                                        iommu->name, req->pasid, ((unsigned long long *)req)[0],
939                                        ((unsigned long long *)req)[1]);
940                                 goto no_pasid;
941                         }
942                 }
943
944                 if (!sdev || sdev->sid != req->rid) {
945                         struct intel_svm_dev *t;
946
947                         sdev = NULL;
948                         rcu_read_lock();
949                         list_for_each_entry_rcu(t, &svm->devs, list) {
950                                 if (t->sid == req->rid) {
951                                         sdev = t;
952                                         break;
953                                 }
954                         }
955                         rcu_read_unlock();
956                 }
957
958                 result = QI_RESP_INVALID;
959                 /* Since we're using init_mm.pgd directly, we should never take
960                  * any faults on kernel addresses. */
961                 if (!svm->mm)
962                         goto bad_req;
963
964                 /* If address is not canonical, return invalid response */
965                 if (!is_canonical_address(address))
966                         goto bad_req;
967
968                 /*
969                  * If prq is to be handled outside iommu driver via receiver of
970                  * the fault notifiers, we skip the page response here.
971                  */
972                 if (svm->flags & SVM_FLAG_GUEST_MODE) {
973                         if (sdev && !intel_svm_prq_report(sdev->dev, req))
974                                 goto prq_advance;
975                         else
976                                 goto bad_req;
977                 }
978
979                 /* If the mm is already defunct, don't handle faults. */
980                 if (!mmget_not_zero(svm->mm))
981                         goto bad_req;
982
983                 mmap_read_lock(svm->mm);
984                 vma = find_extend_vma(svm->mm, address);
985                 if (!vma || address < vma->vm_start)
986                         goto invalid;
987
988                 if (access_error(vma, req))
989                         goto invalid;
990
991                 ret = handle_mm_fault(vma, address,
992                                       req->wr_req ? FAULT_FLAG_WRITE : 0,
993                                       NULL);
994                 if (ret & VM_FAULT_ERROR)
995                         goto invalid;
996
997                 result = QI_RESP_SUCCESS;
998 invalid:
999                 mmap_read_unlock(svm->mm);
1000                 mmput(svm->mm);
1001 bad_req:
1002                 WARN_ON(!sdev);
1003                 if (sdev && sdev->ops && sdev->ops->fault_cb) {
1004                         int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
1005                                 (req->exe_req << 1) | (req->pm_req);
1006                         sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
1007                                             req->priv_data, rwxp, result);
1008                 }
1009                 /* We get here in the error case where the PASID lookup failed,
1010                    and these can be NULL. Do not use them below this point! */
1011                 sdev = NULL;
1012                 svm = NULL;
1013 no_pasid:
1014                 if (req->lpig || req->priv_data_present) {
1015                         /*
1016                          * Per VT-d spec. v3.0 ch7.7, system software must
1017                          * respond with page group response if private data
1018                          * is present (PDP) or last page in group (LPIG) bit
1019                          * is set. This is an additional VT-d feature beyond
1020                          * PCI ATS spec.
1021                          */
1022                         resp.qw0 = QI_PGRP_PASID(req->pasid) |
1023                                 QI_PGRP_DID(req->rid) |
1024                                 QI_PGRP_PASID_P(req->pasid_present) |
1025                                 QI_PGRP_PDP(req->pasid_present) |
1026                                 QI_PGRP_RESP_CODE(result) |
1027                                 QI_PGRP_RESP_TYPE;
1028                         resp.qw1 = QI_PGRP_IDX(req->prg_index) |
1029                                 QI_PGRP_LPIG(req->lpig);
1030
1031                         resp.qw2 = 0;
1032                         resp.qw3 = 0;
1033                         if (req->priv_data_present)
1034                                 memcpy(&resp.qw2, req->priv_data,
1035                                        sizeof(req->priv_data));
1036                         qi_submit_sync(iommu, &resp, 1, 0);
1037                 }
1038 prq_advance:
1039                 head = (head + sizeof(*req)) & PRQ_RING_MASK;
1040         }
1041
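        /* Everything up to the snapshotted tail has been handled; publish the new head. */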
1042         dmar_writeq(iommu->reg + DMAR_PQH_REG, tail);
1043
1044         /*
1045          * Clear the page request overflow bit and wake up all threads that
1046          * are waiting for the completion of this handling.
1047          */
1048         if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO)
1049                 writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG);
1050
1051         if (!completion_done(&iommu->prq_complete))
1052                 complete(&iommu->prq_complete);
1053
1054         return IRQ_RETVAL(handled);
1055 }
1056
1057 #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva)
1058 struct iommu_sva *
1059 intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata)
1060 {
1061         struct iommu_sva *sva = ERR_PTR(-EINVAL);
1062         struct intel_svm_dev *sdev = NULL;
1063         unsigned int flags = 0;
1064         int ret;
1065
1066         /*
1067          * TODO: Consolidate with generic iommu-sva bind after it is merged.
1068          * It will require shared SVM data structures, i.e. combine io_mm
1069          * and intel_svm etc.
1070          */
1071         if (drvdata)
1072                 flags = *(unsigned int *)drvdata;
1073         mutex_lock(&pasid_mutex);
1074         ret = intel_svm_bind_mm(dev, flags, NULL, mm, &sdev);
1075         if (ret)
1076                 sva = ERR_PTR(ret);
1077         else if (sdev)
1078                 sva = &sdev->sva;
1079         else
1080                 WARN(!sdev, "SVM bind succeeded with no sdev!\n");
1081
1082         mutex_unlock(&pasid_mutex);
1083
1084         return sva;
1085 }
1086
1087 void intel_svm_unbind(struct iommu_sva *sva)
1088 {
1089         struct intel_svm_dev *sdev;
1090
1091         mutex_lock(&pasid_mutex);
1092         sdev = to_intel_svm_dev(sva);
1093         intel_svm_unbind_mm(sdev->dev, sdev->pasid);
1094         mutex_unlock(&pasid_mutex);
1095 }
1096
1097 u32 intel_svm_get_pasid(struct iommu_sva *sva)
1098 {
1099         struct intel_svm_dev *sdev;
1100         u32 pasid;
1101
1102         mutex_lock(&pasid_mutex);
1103         sdev = to_intel_svm_dev(sva);
1104         pasid = sdev->pasid;
1105         mutex_unlock(&pasid_mutex);
1106
1107         return pasid;
1108 }
1109
1110 int intel_svm_page_response(struct device *dev,
1111                             struct iommu_fault_event *evt,
1112                             struct iommu_page_response *msg)
1113 {
1114         struct iommu_fault_page_request *prm;
1115         struct intel_svm_dev *sdev = NULL;
1116         struct intel_svm *svm = NULL;
1117         struct intel_iommu *iommu;
1118         bool private_present;
1119         bool pasid_present;
1120         bool last_page;
1121         u8 bus, devfn;
1122         int ret = 0;
1123         u16 sid;
1124
1125         if (!dev || !dev_is_pci(dev))
1126                 return -ENODEV;
1127
1128         iommu = device_to_iommu(dev, &bus, &devfn);
1129         if (!iommu)
1130                 return -ENODEV;
1131
1132         if (!msg || !evt)
1133                 return -EINVAL;
1134
1135         mutex_lock(&pasid_mutex);
1136
1137         prm = &evt->fault.prm;
1138         sid = PCI_DEVID(bus, devfn);
1139         pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
1140         private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
1141         last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
1142
1143         if (!pasid_present) {
1144                 ret = -EINVAL;
1145                 goto out;
1146         }
1147
1148         if (prm->pasid == 0 || prm->pasid >= PASID_MAX) {
1149                 ret = -EINVAL;
1150                 goto out;
1151         }
1152
1153         ret = pasid_to_svm_sdev(dev, prm->pasid, &svm, &sdev);
1154         if (ret || !sdev) {
1155                 ret = -ENODEV;
1156                 goto out;
1157         }
1158
1159         /*
1160          * For responses from userspace, we need to make sure that the
1161          * PASID has been bound to its mm.
1162          */
1163         if (svm->flags & SVM_FLAG_GUEST_MODE) {
1164                 struct mm_struct *mm;
1165
1166                 mm = get_task_mm(current);
1167                 if (!mm) {
1168                         ret = -EINVAL;
1169                         goto out;
1170                 }
1171
1172                 if (mm != svm->mm) {
1173                         ret = -ENODEV;
1174                         mmput(mm);
1175                         goto out;
1176                 }
1177
1178                 mmput(mm);
1179         }
1180
1181         /*
1182          * Per VT-d spec. v3.0 ch7.7, system software must respond
1183          * with page group response if private data is present (PDP)
1184          * or last page in group (LPIG) bit is set. This is an
1185          * additional VT-d requirement beyond PCI ATS spec.
1186          */
1187         if (last_page || private_present) {
1188                 struct qi_desc desc;
1189
1190                 desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
1191                                 QI_PGRP_PASID_P(pasid_present) |
1192                                 QI_PGRP_PDP(private_present) |
1193                                 QI_PGRP_RESP_CODE(msg->code) |
1194                                 QI_PGRP_RESP_TYPE;
1195                 desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
1196                 desc.qw2 = 0;
1197                 desc.qw3 = 0;
1198                 if (private_present)
1199                         memcpy(&desc.qw2, prm->private_data,
1200                                sizeof(prm->private_data));
1201
1202                 qi_submit_sync(iommu, &desc, 1, 0);
1203         }
1204 out:
1205         mutex_unlock(&pasid_mutex);
1206         return ret;
1207 }