// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Support PCI/PCIe on PowerNV platforms
 *
 * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>

#include <asm/sections.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>

#include <misc/cxl-base.h>

#include "powernv.h"
#include "pci.h"
#include "../../../../drivers/pci/pci.h"

#define PNV_IODA1_M64_NUM       16      /* Number of M64 BARs   */
#define PNV_IODA1_M64_SEGS      8       /* Segments per M64 BAR */
#define PNV_IODA1_DMA32_SEGSIZE 0x10000000

static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
                                              "NPU_OCAPI" };

static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
static void pnv_pci_configure_bus(struct pci_bus *bus);

void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;
        char pfix[32];

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        if (pe->flags & PNV_IODA_PE_DEV)
                strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
        else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
                sprintf(pfix, "%04x:%02x     ",
                        pci_domain_nr(pe->pbus), pe->pbus->number);
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                sprintf(pfix, "%04x:%02x:%2x.%d",
                        pci_domain_nr(pe->parent_dev->bus),
                        (pe->rid & 0xff00) >> 8,
                        PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
#endif /* CONFIG_PCI_IOV*/

        printk("%spci %s: [PE# %.2x] %pV",
               level, pfix, pe->pe_number, &vaf);

        va_end(args);
}
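
/*
 * For illustration (not part of the original flow): callers normally use
 * the pe_err()/pe_warn()/pe_info() wrappers from pci.h rather than calling
 * pe_level_printk() directly, e.g.
 *
 *      pe_info(pe, "Associated device to PE\n");
 *
 * which, for a PNV_IODA_PE_DEV PE on device 0000:01:00.0 with PE number 2,
 * prints "pci 0000:01:00.0: [PE# 02] Associated device to PE".
 */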

static bool pnv_iommu_bypass_disabled __read_mostly;
static bool pci_reset_phbs __read_mostly;

static int __init iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;

        while (*str) {
                if (!strncmp(str, "nobypass", 8)) {
                        pnv_iommu_bypass_disabled = true;
                        pr_info("PowerNV: IOMMU bypass window disabled.\n");
                        break;
                }
                str += strcspn(str, ",");
                if (*str == ',')
                        str++;
        }

        return 0;
}
early_param("iommu", iommu_setup);
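
/*
 * For illustration: booting with "iommu=nobypass" on the kernel command
 * line sets pnv_iommu_bypass_disabled above, so the 64-bit direct-mapped
 * DMA (TCE bypass) window is never enabled for any PE.
 */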

static int __init pci_reset_phbs_setup(char *str)
{
        pci_reset_phbs = true;
        return 0;
}

early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);

static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
{
        s64 rc;

        phb->ioda.pe_array[pe_no].phb = phb;
        phb->ioda.pe_array[pe_no].pe_number = pe_no;
        phb->ioda.pe_array[pe_no].dma_setup_done = false;

        /*
         * Clear the PE frozen state as it might have been put into frozen
         * state in the last PCI remove path. It's harmless to do so if the
         * PE is already unfrozen.
         */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
                                       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
        if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
                pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);

        return &phb->ioda.pe_array[pe_no];
}

static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
{
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
                pr_warn("%s: Invalid PE %x on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
        }

        mutex_lock(&phb->ioda.pe_alloc_mutex);
        if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
                pr_debug("%s: PE %x was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
        mutex_unlock(&phb->ioda.pe_alloc_mutex);

        pnv_ioda_init_pe(phb, pe_no);
}

struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
{
        struct pnv_ioda_pe *ret = NULL;
        int run = 0, pe, i;

        mutex_lock(&phb->ioda.pe_alloc_mutex);

        /* scan backwards for a run of @count cleared bits */
        for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
                if (test_bit(pe, phb->ioda.pe_alloc)) {
                        run = 0;
                        continue;
                }

                run++;
                if (run == count)
                        break;
        }
        if (run != count)
                goto out;

        for (i = pe; i < pe + count; i++) {
                set_bit(i, phb->ioda.pe_alloc);
                pnv_ioda_init_pe(phb, i);
        }
        ret = &phb->ioda.pe_array[pe];

out:
        mutex_unlock(&phb->ioda.pe_alloc_mutex);
        return ret;
}
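
/*
 * For illustration: with total_pe_num = 8, pe_alloc = 0b00110001 (PEs 0, 4
 * and 5 in use) and count = 3, the backwards scan counts free PEs 7 and 6,
 * resets at the allocated PE 5, then counts PEs 3, 2 and 1, so the loop
 * breaks at pe = 1 and PEs 1..3 are allocated, returning &pe_array[1].
 */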

void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
        struct pnv_phb *phb = pe->phb;
        unsigned int pe_num = pe->pe_number;

        WARN_ON(pe->pdev);
        WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
        kfree(pe->npucomp);
        memset(pe, 0, sizeof(struct pnv_ioda_pe));

        mutex_lock(&phb->ioda.pe_alloc_mutex);
        clear_bit(pe_num, phb->ioda.pe_alloc);
        mutex_unlock(&phb->ioda.pe_alloc_mutex);
}

/* The default M64 BAR is shared by all PEs */
static int pnv_ioda2_init_m64(struct pnv_phb *phb)
{
        const char *desc;
        struct resource *r;
        s64 rc;

        /* Configure the default M64 BAR */
        rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                         OPAL_M64_WINDOW_TYPE,
                                         phb->ioda.m64_bar_idx,
                                         phb->ioda.m64_base,
                                         0, /* unused */
                                         phb->ioda.m64_size);
        if (rc != OPAL_SUCCESS) {
                desc = "configuring";
                goto fail;
        }

        /* Enable the default M64 BAR */
        rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                      OPAL_M64_WINDOW_TYPE,
                                      phb->ioda.m64_bar_idx,
                                      OPAL_ENABLE_M64_SPLIT);
        if (rc != OPAL_SUCCESS) {
                desc = "enabling";
                goto fail;
        }

        /*
         * Exclude the segments for the reserved and root bus PEs, which
         * are the first or last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
                        phb->ioda.reserved_pe_idx);

        return 0;

fail:
        pr_warn("  Failure %lld %s M64 BAR#%d\n",
                rc, desc, phb->ioda.m64_bar_idx);
        opal_pci_phb_mmio_enable(phb->opal_id,
                                 OPAL_M64_WINDOW_TYPE,
                                 phb->ioda.m64_bar_idx,
                                 OPAL_DISABLE_M64);
        return -EIO;
}

static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
                                         unsigned long *pe_bitmap)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct resource *r;
        resource_size_t base, sgsz, start, end;
        int segno, i;

        base = phb->ioda.m64_base;
        sgsz = phb->ioda.m64_segsize;
        for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
                r = &pdev->resource[i];
                if (!r->parent || !pnv_pci_is_m64(phb, r))
                        continue;

                start = ALIGN_DOWN(r->start - base, sgsz);
                end = ALIGN(r->end - base, sgsz);
                for (segno = start / sgsz; segno < end / sgsz; segno++) {
                        if (pe_bitmap)
                                set_bit(segno, pe_bitmap);
                        else
                                pnv_ioda_reserve_pe(phb, segno);
                }
        }
}
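
/*
 * For illustration: with an M64 segment size of 256MB (0x10000000), a BAR
 * spanning offsets 0x30000000..0x4fffffff from m64_base aligns down to
 * 0x30000000 and up to 0x50000000, so segments 3 and 4 are reserved.
 */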

static int pnv_ioda1_init_m64(struct pnv_phb *phb)
{
        struct resource *r;
        int index;

        /*
         * There are 16 M64 BARs, each of which has 8 segments. So
         * there are as many M64 segments as the maximum number of
         * PEs, which is 128.
         */
        for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
                unsigned long base, segsz = phb->ioda.m64_segsize;
                int64_t rc;

                base = phb->ioda.m64_base +
                       index * PNV_IODA1_M64_SEGS * segsz;
                rc = opal_pci_set_phb_mem_window(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index, base, 0,
                                PNV_IODA1_M64_SEGS * segsz);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }

                rc = opal_pci_phb_mmio_enable(phb->opal_id,
                                OPAL_M64_WINDOW_TYPE, index,
                                OPAL_ENABLE_M64_SPLIT);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }
        }

        for (index = 0; index < phb->ioda.total_pe_num; index++) {
                int64_t rc;

                /*
                 * P7IOC supports M64DT, which helps mapping M64 segment
                 * to one particular PE#. However, PHB3 has fixed mapping
                 * between M64 segment and PE#. In order to have same logic
                 * for P7IOC and PHB3, we enforce fixed mapping between M64
                 * segment and PE# on P7IOC.
                 */
                rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                index, OPAL_M64_WINDOW_TYPE,
                                index / PNV_IODA1_M64_SEGS,
                                index % PNV_IODA1_M64_SEGS);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                index);
                        goto fail;
                }
        }

        /*
         * Exclude the segments for the reserved and root bus PEs, which
         * are the first or last two PEs.
         */
        r = &phb->hose->mem_resources[1];
        if (phb->ioda.reserved_pe_idx == 0)
                r->start += (2 * phb->ioda.m64_segsize);
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
                WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
                     phb->ioda.reserved_pe_idx, phb->hose->global_number);

        return 0;

fail:
        for ( ; index >= 0; index--)
                opal_pci_phb_mmio_enable(phb->opal_id,
                        OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);

        return -EIO;
}

static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
                                    unsigned long *pe_bitmap,
                                    bool all)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list) {
                pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);

                if (all && pdev->subordinate)
                        pnv_ioda_reserve_m64_pe(pdev->subordinate,
                                                pe_bitmap, all);
        }
}

static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
        struct pnv_ioda_pe *master_pe, *pe;
        unsigned long size, *pe_alloc;
        int i;

        /* Root bus shouldn't use M64 */
        if (pci_is_root_bus(bus))
                return NULL;

        /* Allocate bitmap */
        size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
        pe_alloc = kzalloc(size, GFP_KERNEL);
        if (!pe_alloc) {
                pr_warn("%s: Out of memory !\n", __func__);
                return NULL;
        }

        /* Figure out reserved PE numbers by the PE */
        pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);

        /*
         * The current bus might not own the M64 window and it may be
         * entirely contributed by its child buses. In that case, we
         * don't need to pick an M64 dependent PE#.
         */
        if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
                kfree(pe_alloc);
                return NULL;
        }

        /*
         * Figure out the master PE and put all slave PEs into the
         * master PE's list to form a compound PE.
         */
        master_pe = NULL;
        i = -1;
        while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
                phb->ioda.total_pe_num) {
                pe = &phb->ioda.pe_array[i];

                phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
                if (!master_pe) {
                        pe->flags |= PNV_IODA_PE_MASTER;
                        INIT_LIST_HEAD(&pe->slaves);
                        master_pe = pe;
                } else {
                        pe->flags |= PNV_IODA_PE_SLAVE;
                        pe->master = master_pe;
                        list_add_tail(&pe->list, &master_pe->slaves);
                }
        }

        kfree(pe_alloc);
        return master_pe;
}

static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
{
        struct pci_controller *hose = phb->hose;
        struct device_node *dn = hose->dn;
        struct resource *res;
        u32 m64_range[2], i;
        const __be32 *r;
        u64 pci_addr;

        if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
                pr_info("  M64 window not supported\n");
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_OPAL)) {
                pr_info("  Firmware too old to support M64 window\n");
                return;
        }

        r = of_get_property(dn, "ibm,opal-m64-window", NULL);
        if (!r) {
                pr_info("  No <ibm,opal-m64-window> on %pOF\n", dn);
                return;
        }

        /*
         * Find the available M64 BAR range and pick the last one to
         * cover the whole 64-bit space. We support only one range.
         */
        if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
                                       m64_range, 2)) {
                /* In absence of the property, assume 0..15 */
                m64_range[0] = 0;
                m64_range[1] = 16;
        }
        /* We only support 64 bits in our allocator */
        if (m64_range[1] > 63) {
                pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
                        __func__, m64_range[1], phb->hose->global_number);
                m64_range[1] = 63;
        }
        /* Empty range, no m64 */
        if (m64_range[1] <= m64_range[0]) {
                pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
                        __func__, phb->hose->global_number);
                return;
        }

        /* Configure M64 information */
        res = &hose->mem_resources[1];
        res->name = dn->full_name;
        res->start = of_translate_address(dn, r + 2);
        res->end = res->start + of_read_number(r + 4, 2) - 1;
        res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
        pci_addr = of_read_number(r, 2);
        hose->mem_offset[1] = res->start - pci_addr;

        phb->ioda.m64_size = resource_size(res);
        phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
        phb->ioda.m64_base = pci_addr;

        /* This lines up nicely with the display from processing OF ranges */
        pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
                res->start, res->end, pci_addr, m64_range[0],
                m64_range[0] + m64_range[1] - 1);

        /* Mark all M64 used up by default */
        phb->ioda.m64_bar_alloc = (unsigned long)-1;

        /* Use last M64 BAR to cover M64 window */
        m64_range[1]--;
        phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];

        pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);

        /* Mark remaining ones free */
        for (i = m64_range[0]; i < m64_range[1]; i++)
                clear_bit(i, &phb->ioda.m64_bar_alloc);

        /*
         * Setup init functions for M64 based on IODA version, IODA3 uses
         * the IODA2 code.
         */
        if (phb->type == PNV_PHB_IODA1)
                phb->init_m64 = pnv_ioda1_init_m64;
        else
                phb->init_m64 = pnv_ioda2_init_m64;
}
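
/*
 * For illustration, assuming a 256GB M64 window and total_pe_num = 256:
 * m64_segsize = 256GB / 256 = 1GB, so each PE owns one 1GB segment of
 * the window.
 */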

static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
        struct pnv_ioda_pe *slave;
        s64 rc;

        /* Fetch master PE */
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
                        return;

                pe_no = pe->pe_number;
        }

        /* Freeze master PE */
        rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                     pe_no,
                                     OPAL_EEH_ACTION_SET_FREEZE_ALL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);
                return;
        }

        /* Freeze slave PEs */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_set(phb->opal_id,
                                             slave->pe_number,
                                             OPAL_EEH_ACTION_SET_FREEZE_ALL);
                if (rc != OPAL_SUCCESS)
                        pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
                                __func__, rc, phb->hose->global_number,
                                slave->pe_number);
        }
}

static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
{
        struct pnv_ioda_pe *pe, *slave;
        s64 rc;

        /* Find master PE */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Clear frozen state for master PE */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                        __func__, rc, opt, phb->hose->global_number, pe_no);
                return -EIO;
        }

        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Clear frozen state for slave PEs */
        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_clear(phb->opal_id,
                                             slave->pe_number,
                                             opt);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
                                __func__, rc, opt, phb->hose->global_number,
                                slave->pe_number);
                        return -EIO;
                }
        }

        return 0;
}

static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
{
        struct pnv_ioda_pe *slave, *pe;
        u8 fstate = 0, state;
        __be16 pcierr = 0;
        s64 rc;

        /* Sanity check on PE number */
        if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
                return OPAL_EEH_STOPPED_PERM_UNAVAIL;

        /*
         * Fetch the master PE; the PE instance might not be
         * initialized yet.
         */
        pe = &phb->ioda.pe_array[pe_no];
        if (pe->flags & PNV_IODA_PE_SLAVE) {
                pe = pe->master;
                WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
                pe_no = pe->pe_number;
        }

        /* Check the master PE */
        rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
                                        &state, &pcierr, NULL);
        if (rc != OPAL_SUCCESS) {
                pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                        __func__, rc,
                        phb->hose->global_number, pe_no);
                return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
        }

        /* Check the slave PE */
        if (!(pe->flags & PNV_IODA_PE_MASTER))
                return state;

        list_for_each_entry(slave, &pe->slaves, list) {
                rc = opal_pci_eeh_freeze_status(phb->opal_id,
                                                slave->pe_number,
                                                &fstate,
                                                &pcierr,
                                                NULL);
                if (rc != OPAL_SUCCESS) {
                        pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
                                __func__, rc,
                                phb->hose->global_number, slave->pe_number);
                        return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
                }

                /*
                 * Override the result based on the ascending
                 * priority.
                 */
                if (fstate > state)
                        state = fstate;
        }

        return state;
}

struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
{
        int pe_number = phb->ioda.pe_rmap[bdfn];

        if (pe_number == IODA_INVALID_PE)
                return NULL;

        return &phb->ioda.pe_array[pe_number];
}

struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
        struct pci_dn *pdn = pci_get_pdn(dev);

        if (!pdn)
                return NULL;
        if (pdn->pe_number == IODA_INVALID_PE)
                return NULL;
        return &phb->ioda.pe_array[pdn->pe_number];
}

static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
                                  struct pnv_ioda_pe *parent,
                                  struct pnv_ioda_pe *child,
                                  bool is_add)
{
        const char *desc = is_add ? "adding" : "removing";
        uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
                              OPAL_REMOVE_PE_FROM_DOMAIN;
        struct pnv_ioda_pe *slave;
        long rc;

        /* Parent PE affects child PE */
        rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                child->pe_number, op);
        if (rc != OPAL_SUCCESS) {
                pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
                        rc, desc);
                return -ENXIO;
        }

        if (!(child->flags & PNV_IODA_PE_MASTER))
                return 0;

        /* Compound case: parent PE affects slave PEs */
        list_for_each_entry(slave, &child->slaves, list) {
                rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
                                        slave->pe_number, op);
                if (rc != OPAL_SUCCESS) {
                        pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
                                rc, desc);
                        return -ENXIO;
                }
        }

        return 0;
}

static int pnv_ioda_set_peltv(struct pnv_phb *phb,
                              struct pnv_ioda_pe *pe,
                              bool is_add)
{
        struct pnv_ioda_pe *slave;
        struct pci_dev *pdev = NULL;
        int ret;

        /*
         * Clear the PE frozen state. If it's a master PE, we need
         * to clear the slave PEs' frozen state as well.
         */
        if (is_add) {
                opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                if (pe->flags & PNV_IODA_PE_MASTER) {
                        list_for_each_entry(slave, &pe->slaves, list)
                                opal_pci_eeh_freeze_clear(phb->opal_id,
                                                          slave->pe_number,
                                                          OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
                }
        }

        /*
         * Associate the PE in PELT. We need to add the PE into the
         * corresponding PELT-V as well. Otherwise, errors originating
         * from the PE might spread to other PEs.
         */
        ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
        if (ret)
                return ret;

        /* For compound PEs, any one affects all of them */
        if (pe->flags & PNV_IODA_PE_MASTER) {
                list_for_each_entry(slave, &pe->slaves, list) {
                        ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
                        if (ret)
                                return ret;
                }
        }

        if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
                pdev = pe->pbus->self;
        else if (pe->flags & PNV_IODA_PE_DEV)
                pdev = pe->pdev->bus->self;
#ifdef CONFIG_PCI_IOV
        else if (pe->flags & PNV_IODA_PE_VF)
                pdev = pe->parent_dev;
#endif /* CONFIG_PCI_IOV */
        while (pdev) {
                struct pci_dn *pdn = pci_get_pdn(pdev);
                struct pnv_ioda_pe *parent;

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        parent = &phb->ioda.pe_array[pdn->pe_number];
                        ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
                        if (ret)
                                return ret;
                }

                pdev = pdev->bus->self;
        }

        return 0;
}

static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
                                 struct pnv_ioda_pe *pe,
                                 struct pci_dev *parent)
{
        int64_t rc;

        while (parent) {
                struct pci_dn *pdn = pci_get_pdn(parent);

                if (pdn && pdn->pe_number != IODA_INVALID_PE) {
                        rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
                                                pe->pe_number,
                                                OPAL_REMOVE_PE_FROM_DOMAIN);
                        /* XXX What to do in case of error ? */
                }
                parent = parent->bus->self;
        }

        opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
                                  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);

        /* Disassociate PE in PELT */
        rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
                                pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
        if (rc)
                pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
}

int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        int64_t rc;
        long rid_end, rid;

        /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = resource_size(&pe->pbus->busn_res);
                else
                        count = 1;

                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /* Clear the reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;

        /*
         * Release from all parents PELT-V. NPUs don't have a PELTV
         * table
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_unset_peltv(phb, pe, parent);

        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
        if (rc)
                pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);

        pe->pbus = NULL;
        pe->pdev = NULL;
#ifdef CONFIG_PCI_IOV
        pe->parent_dev = NULL;
#endif

        return 0;
}

int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
        struct pci_dev *parent;
        uint8_t bcomp, dcomp, fcomp;
        long rc, rid_end, rid;

        /* Bus validation ? */
        if (pe->pbus) {
                int count;

                dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
                fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
                parent = pe->pbus->self;
                if (pe->flags & PNV_IODA_PE_BUS_ALL)
                        count = resource_size(&pe->pbus->busn_res);
                else
                        count = 1;

                switch(count) {
                case  1: bcomp = OpalPciBusAll;         break;
                case  2: bcomp = OpalPciBus7Bits;       break;
                case  4: bcomp = OpalPciBus6Bits;       break;
                case  8: bcomp = OpalPciBus5Bits;       break;
                case 16: bcomp = OpalPciBus4Bits;       break;
                case 32: bcomp = OpalPciBus3Bits;       break;
                default:
                        dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
                                count);
                        /* Do an exact match only */
                        bcomp = OpalPciBusAll;
                }
                rid_end = pe->rid + (count << 8);
        } else {
#ifdef CONFIG_PCI_IOV
                if (pe->flags & PNV_IODA_PE_VF)
                        parent = pe->parent_dev;
                else
#endif /* CONFIG_PCI_IOV */
                        parent = pe->pdev->bus->self;
                bcomp = OpalPciBusAll;
                dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
                fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
                rid_end = pe->rid + 1;
        }

        /*
         * Associate the PE in PELT. We need to add the PE into the
         * corresponding PELT-V as well. Otherwise, errors originating
         * from the PE might spread to other PEs.
         */
        rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                             bcomp, dcomp, fcomp, OPAL_MAP_PE);
        if (rc) {
                pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
                return -ENXIO;
        }

        /*
         * Configure PELTV. NPUs don't have a PELTV table so skip
         * configuration on them.
         */
        if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
                pnv_ioda_set_peltv(phb, pe, true);

        /* Setup reverse map */
        for (rid = pe->rid; rid < rid_end; rid++)
                phb->ioda.pe_rmap[rid] = pe->pe_number;

        /* Set up one MVE on IODA1 */
        if (phb->type != PNV_PHB_IODA1) {
                pe->mve_number = 0;
                goto out;
        }

        pe->mve_number = pe->pe_number;
        rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
        if (rc != OPAL_SUCCESS) {
                pe_err(pe, "OPAL error %ld setting up MVE %x\n",
                       rc, pe->mve_number);
                pe->mve_number = -1;
        } else {
                rc = opal_pci_set_mve_enable(phb->opal_id,
                                             pe->mve_number, OPAL_ENABLE_MVE);
                if (rc) {
                        pe_err(pe, "OPAL error %ld enabling MVE %x\n",
                               rc, pe->mve_number);
                        pe->mve_number = -1;
                }
        }

out:
        return 0;
}
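
/*
 * For illustration: a PNV_IODA_PE_BUS_ALL PE spanning 8 subordinate buses
 * gets bcomp = OpalPciBus5Bits, i.e. only the top 5 bits of the bus number
 * are compared when matching an RID against this PE, and the reverse map
 * covers rid_end - rid = 8 << 8 = 2048 RIDs.
 */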

static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
        struct pci_dn *pdn = pci_get_pdn(dev);
        struct pnv_ioda_pe *pe;

        if (!pdn) {
                pr_err("%s: Device tree node not associated properly\n",
                           pci_name(dev));
                return NULL;
        }
        if (pdn->pe_number != IODA_INVALID_PE)
                return NULL;

        pe = pnv_ioda_alloc_pe(phb, 1);
        if (!pe) {
                pr_warn("%s: Not enough PE# available, disabling device\n",
                        pci_name(dev));
                return NULL;
        }

        /* NOTE: We don't get a reference for the pointer in the PE
         * data structure, both the device and PE structures should be
         * destroyed at the same time. However, removing nvlink
         * devices will need some work.
         *
         * At some point we want to remove the PDN completely anyways
         */
        pdn->pe_number = pe->pe_number;
        pe->flags = PNV_IODA_PE_DEV;
        pe->pdev = dev;
        pe->pbus = NULL;
        pe->mve_number = -1;
        pe->rid = dev->bus->number << 8 | pdn->devfn;
        pe->device_count++;

        pe_info(pe, "Associated device to PE\n");

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                pnv_ioda_free_pe(pe);
                pdn->pe_number = IODA_INVALID_PE;
                pe->pdev = NULL;
                return NULL;
        }

        /* Put PE to the list */
        mutex_lock(&phb->ioda.pe_list_mutex);
        list_add_tail(&pe->list, &phb->ioda.pe_list);
        mutex_unlock(&phb->ioda.pe_list_mutex);
        return pe;
}

/*
 * There are two types of PCI-bus-sensitive PEs: one comprised of a
 * single PCI bus, and another that contains the primary PCI bus and
 * its subordinate PCI devices and buses. The second type of PE is
 * normally originated by a PCIe-to-PCI bridge or a PLX switch
 * downstream port.
 */
static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
        struct pnv_ioda_pe *pe = NULL;
        unsigned int pe_num;

        /*
         * In the partial hotplug case, the PE instance might still be
         * alive. We should reuse it instead of allocating a new one.
         */
        pe_num = phb->ioda.pe_rmap[bus->number << 8];
        if (WARN_ON(pe_num != IODA_INVALID_PE)) {
                pe = &phb->ioda.pe_array[pe_num];
                return NULL;
        }

        /* PE number for root bus should have been reserved */
        if (pci_is_root_bus(bus))
                pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];

        /* Check if PE is determined by M64 */
        if (!pe)
                pe = pnv_ioda_pick_m64_pe(bus, all);

        /* The PE number isn't pinned by M64 */
        if (!pe)
                pe = pnv_ioda_alloc_pe(phb, 1);

        if (!pe) {
                pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
                        __func__, pci_domain_nr(bus), bus->number);
                return NULL;
        }

        pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
        pe->pbus = bus;
        pe->pdev = NULL;
        pe->mve_number = -1;
        pe->rid = bus->busn_res.start << 8;

        if (all)
                pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
                        &bus->busn_res.start, &bus->busn_res.end,
                        pe->pe_number);
        else
                pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
                        &bus->busn_res.start, pe->pe_number);

        if (pnv_ioda_configure_pe(phb, pe)) {
                /* XXX What do we do here ? */
                pnv_ioda_free_pe(pe);
                pe->pbus = NULL;
                return NULL;
        }

        /* Put PE to the list */
        list_add_tail(&pe->list, &phb->ioda.pe_list);

        return pe;
}

static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
{
        int pe_num, found_pe = false, rc;
        long rid;
        struct pnv_ioda_pe *pe;
        struct pci_dev *gpu_pdev;
        struct pci_dn *npu_pdn;
        struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);

        /*
         * Intentionally leak a reference on the npu device (for
         * nvlink only; this is not an opencapi path) to make sure it
         * never goes away, as it's been the case all along and some
         * work is needed otherwise.
         */
        pci_dev_get(npu_pdev);

        /*
         * Due to a hardware errata PE#0 on the NPU is reserved for
         * error handling. This means we only have three PEs remaining
         * which need to be assigned to four links, implying some
         * links must share PEs.
         *
         * To achieve this we assign PEs such that NPUs linking the
         * same GPU get assigned the same PE.
         */
        gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
        for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
                pe = &phb->ioda.pe_array[pe_num];
                if (!pe->pdev)
                        continue;

                if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
                        /*
                         * This device has the same peer GPU so should
                         * be assigned the same PE as the existing
                         * peer NPU.
                         */
                        dev_info(&npu_pdev->dev,
                                "Associating to existing PE %x\n", pe_num);
                        npu_pdn = pci_get_pdn(npu_pdev);
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
                        npu_pdn->pe_number = pe_num;
                        phb->ioda.pe_rmap[rid] = pe->pe_number;
                        pe->device_count++;

                        /* Map the PE to this link */
                        rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
                                        OpalPciBusAll,
                                        OPAL_COMPARE_RID_DEVICE_NUMBER,
                                        OPAL_COMPARE_RID_FUNCTION_NUMBER,
                                        OPAL_MAP_PE);
                        WARN_ON(rc != OPAL_SUCCESS);
                        found_pe = true;
                        break;
                }
        }

        if (!found_pe)
                /*
                 * Could not find an existing PE so allocate a new
                 * one.
                 */
                return pnv_ioda_setup_dev_PE(npu_pdev);
        else
                return pe;
}

static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
{
        struct pci_dev *pdev;

        list_for_each_entry(pdev, &bus->devices, bus_list)
                pnv_ioda_setup_npu_PE(pdev);
}

static void pnv_pci_ioda_setup_nvlink(void)
{
        struct pci_controller *hose;
        struct pnv_phb *phb;
        struct pnv_ioda_pe *pe;

        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type == PNV_PHB_NPU_NVLINK) {
                        /* PE#0 is needed for error reporting */
                        pnv_ioda_reserve_pe(phb, 0);
                        pnv_ioda_setup_npu_PEs(hose->bus);
                        if (phb->model == PNV_PHB_MODEL_NPU2)
                                WARN_ON_ONCE(pnv_npu2_init(hose));
                }
        }
        list_for_each_entry(hose, &hose_list, list_node) {
                phb = hose->private_data;
                if (phb->type != PNV_PHB_IODA2)
                        continue;

                list_for_each_entry(pe, &phb->ioda.pe_list, list)
                        pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
        }

#ifdef CONFIG_IOMMU_API
        /* setup iommu groups so we can do nvlink pass-thru */
        pnv_pci_npu_setup_iommu_groups();
#endif
}

static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
                                       struct pnv_ioda_pe *pe);

static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct pci_dn *pdn = pci_get_pdn(pdev);
        struct pnv_ioda_pe *pe;

        /* Check if the BDFN for this device is associated with a PE yet */
        pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
        if (!pe) {
                /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
                if (WARN_ON(pdev->is_virtfn))
                        return;

                pnv_pci_configure_bus(pdev->bus);
                pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
                pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);

                /*
                 * If we can't setup the IODA PE something has gone horribly
                 * wrong and we can't enable DMA for the device.
                 */
                if (WARN_ON(!pe))
                        return;
        } else {
                pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
        }

        /*
         * We assume that bridges *probably* don't need to do any DMA so we can
         * skip allocating a TCE table, etc unless we get a non-bridge device.
         */
        if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
                switch (phb->type) {
                case PNV_PHB_IODA1:
                        pnv_pci_ioda1_setup_dma_pe(phb, pe);
                        break;
                case PNV_PHB_IODA2:
                        pnv_pci_ioda2_setup_dma_pe(phb, pe);
                        break;
                default:
                        pr_warn("%s: No DMA for PHB#%x (type %d)\n",
                                __func__, phb->hose->global_number, phb->type);
                }
        }

        if (pdn)
                pdn->pe_number = pe->pe_number;
        pe->device_count++;

        WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
        pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
        set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);

        /* PEs with a DMA weight of zero won't have a group */
        if (pe->table_group.group)
                iommu_add_device(&pe->table_group, &pdev->dev);
}

/*
 * Reconfigure TVE#0 to be usable as 64-bit DMA space.
 *
 * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
 * Devices can only access more than that if bit 59 of the PCI address is set
 * by hardware, which indicates TVE#1 should be used instead of TVE#0.
 * Many PCI devices are not capable of addressing that many bits, and as a
 * result are limited to the 4GB of virtual memory made available to 32-bit
 * devices in TVE#0.
 *
 * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
 * devices by configuring the virtual memory past the first 4GB inaccessible
 * by 64-bit DMAs.  This should only be used by devices that want more than
 * 4GB, and only on PEs that have no 32-bit devices.
 *
 * Currently this will only work on PHB3 (POWER8).
 */
static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
{
        u64 window_size, table_size, tce_count, addr;
        struct page *table_pages;
        u64 tce_order = 28; /* 256MB TCEs */
        __be64 *tces;
        s64 rc;

        /*
         * Window size needs to be a power of two, but needs to account for
         * shifting memory by the 4GB offset required to skip 32bit space.
         */
        window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
        tce_count = window_size >> tce_order;
        table_size = tce_count << 3;

        if (table_size < PAGE_SIZE)
                table_size = PAGE_SIZE;

        table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
                                       get_order(table_size));
        if (!table_pages)
                goto err;

        tces = page_address(table_pages);
        if (!tces)
                goto err;

        memset(tces, 0, table_size);

        for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
                tces[(addr + (1ULL << 32)) >> tce_order] =
                        cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
        }

        rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
                                        pe->pe_number,
                                        /* reconfigure window 0 */
                                        (pe->pe_number << 1) + 0,
                                        1,
                                        __pa(tces),
                                        table_size,
                                        1 << tce_order);
        if (rc == OPAL_SUCCESS) {
                pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
                return 0;
        }
err:
        pe_err(pe, "Error configuring 64-bit DMA bypass\n");
        return -EIO;
}
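
/*
 * For illustration, assuming memory_hotplug_max() = 60GB: window_size =
 * roundup_pow_of_two(60GB + 4GB) = 64GB, tce_count = 64GB >> 28 = 256 and
 * table_size = 256 * 8 = 2KB, which is then rounded up to PAGE_SIZE. Each
 * TCE maps 256MB, and RAM at physical address X appears at DMA address
 * X + 4GB, which is why the caller sets dma_offset to 1ULL << 32.
 */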

static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
                u64 dma_mask)
{
        struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
        struct pci_dn *pdn = pci_get_pdn(pdev);
        struct pnv_ioda_pe *pe;

        if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
                return false;

        pe = &phb->ioda.pe_array[pdn->pe_number];
        if (pe->tce_bypass_enabled) {
                u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
                if (dma_mask >= top)
                        return true;
        }

        /*
         * If the device can't set the TCE bypass bit but still wants
         * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
         * bypass the 32-bit region and be usable for 64-bit DMAs.
         * The device needs to be able to address all of this space.
         */
        if (dma_mask >> 32 &&
            dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
            /* pe->pdev should be set if it's a single device, pe->pbus if not */
            (pe->device_count == 1 || !pe->pbus) &&
            phb->model == PNV_PHB_MODEL_PHB3) {
                /* Configure the bypass mode */
                s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
                if (rc)
                        return false;
                /* 4GB offset bypasses 32-bit space */
                pdev->dev.archdata.dma_offset = (1ULL << 32);
                return true;
        }

        return false;
}

static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
                                                     bool real_mode)
{
        return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
                (phb->regs + 0x210);
}

static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
                unsigned long index, unsigned long npages, bool rm)
{
        struct iommu_table_group_link *tgl = list_first_entry_or_null(
                        &tbl->it_group_list, struct iommu_table_group_link,
                        next);
        struct pnv_ioda_pe *pe = container_of(tgl->table_group,
                        struct pnv_ioda_pe, table_group);
        __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
        unsigned long start, end, inc;

        start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
        end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
                        npages - 1);

        /* p7ioc-style invalidation, 2 TCEs per write */
        start |= (1ull << 63);
        end |= (1ull << 63);
        inc = 16;
        end |= inc - 1; /* round up end to be different than start */

        mb(); /* Ensure above stores are visible */
        while (start <= end) {
                if (rm)
                        __raw_rm_writeq_be(start, invalidate);
                else
                        __raw_writeq_be(start, invalidate);

                start += inc;
        }

        /*
         * The iommu layer will do another mb() for us on build()
         * and we don't care on free()
         */
}
1431
1432 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1433                 long npages, unsigned long uaddr,
1434                 enum dma_data_direction direction,
1435                 unsigned long attrs)
1436 {
1437         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1438                         attrs);
1439
1440         if (!ret)
1441                 pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1442
1443         return ret;
1444 }
1445
1446 #ifdef CONFIG_IOMMU_API
1447 /* Common for IODA1 and IODA2 */
1448 static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
1449                 unsigned long *hpa, enum dma_data_direction *direction,
1450                 bool realmode)
1451 {
1452         return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
1453 }
1454 #endif
1455
1456 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1457                 long npages)
1458 {
1459         pnv_tce_free(tbl, index, npages);
1460
1461         pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1462 }
1463
1464 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1465         .set = pnv_ioda1_tce_build,
1466 #ifdef CONFIG_IOMMU_API
1467         .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
1468         .tce_kill = pnv_pci_p7ioc_tce_invalidate,
1469         .useraddrptr = pnv_tce_useraddrptr,
1470 #endif
1471         .clear = pnv_ioda1_tce_free,
1472         .get = pnv_tce_get,
1473 };
1474
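/* Command encodings for the PHB3 TCE kill register */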
1475 #define PHB3_TCE_KILL_INVAL_ALL         PPC_BIT(0)
1476 #define PHB3_TCE_KILL_INVAL_PE          PPC_BIT(1)
1477 #define PHB3_TCE_KILL_INVAL_ONE         PPC_BIT(2)
1478
1479 static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1480 {
1481         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
1482         const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
1483
1484         mb(); /* Ensure previous TCE table stores are visible */
1485         if (rm)
1486                 __raw_rm_writeq_be(val, invalidate);
1487         else
1488                 __raw_writeq_be(val, invalidate);
1489 }
1490
1491 static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1492 {
1493         /* 01xb - invalidate TCEs that match the specified PE# */
1494         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
1495         unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
1496
1497         mb(); /* Ensure above stores are visible */
1498         __raw_writeq_be(val, invalidate);
1499 }
1500
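/*
 * PHB3 per-page TCE kill: each write encodes the "invalidate one"
 * command bit, the PE number and the DMA address (index << shift),
 * stepping one IOMMU page at a time.
 */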
1501 static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
1502                                         unsigned shift, unsigned long index,
1503                                         unsigned long npages)
1504 {
1505         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
1506         unsigned long start, end, inc;
1507
1508         /* We'll invalidate DMA addresses in PE scope */
1509         start = PHB3_TCE_KILL_INVAL_ONE;
1510         start |= (pe->pe_number & 0xFF);
1511         end = start;
1512
1513         /* Figure out the start, end and step */
1514         start |= (index << shift);
1515         end |= ((index + npages - 1) << shift);
1516         inc = (0x1ull << shift);
1517         mb();
1518
1519         while (start <= end) {
1520                 if (rm)
1521                         __raw_rm_writeq_be(start, invalidate);
1522                 else
1523                         __raw_writeq_be(start, invalidate);
1524                 start += inc;
1525         }
1526 }
1527
1528 static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1529 {
1530         struct pnv_phb *phb = pe->phb;
1531
1532         if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
1533                 pnv_pci_phb3_tce_invalidate_pe(pe);
1534         else
1535                 opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
1536                                   pe->pe_number, 0, 0, 0);
1537 }
1538
1539 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
1540                 unsigned long index, unsigned long npages, bool rm)
1541 {
1542         struct iommu_table_group_link *tgl;
1543
1544         list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
1545                 struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1546                                 struct pnv_ioda_pe, table_group);
1547                 struct pnv_phb *phb = pe->phb;
1548                 unsigned int shift = tbl->it_page_shift;
1549
1550                 /*
1551                  * NVLink1 can use the TCE kill register directly as
1552                  * it's the same as PHB3. NVLink2 is different and
1553                  * should go via the OPAL call.
1554                  */
1555                 if (phb->model == PNV_PHB_MODEL_NPU) {
1556                         /*
1557                          * The NVLink hardware does not support TCE kill
1558                          * per TCE entry so we have to invalidate
1559                          * the entire cache for it.
1560                          */
1561                         pnv_pci_phb3_tce_invalidate_entire(phb, rm);
1562                         continue;
1563                 }
1564                 if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
1565                         pnv_pci_phb3_tce_invalidate(pe, rm, shift,
1566                                                     index, npages);
1567                 else
1568                         opal_pci_tce_kill(phb->opal_id,
1569                                           OPAL_PCI_TCE_KILL_PAGES,
1570                                           pe->pe_number, 1u << shift,
1571                                           index << shift, npages);
1572         }
1573 }
1574
1575 void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1576 {
1577         if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
1578                 pnv_pci_phb3_tce_invalidate_entire(phb, rm);
1579         else
1580                 opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
1581 }
1582
1583 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
1584                 long npages, unsigned long uaddr,
1585                 enum dma_data_direction direction,
1586                 unsigned long attrs)
1587 {
1588         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1589                         attrs);
1590
1591         if (!ret)
1592                 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1593
1594         return ret;
1595 }
1596
1597 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
1598                 long npages)
1599 {
1600         pnv_tce_free(tbl, index, npages);
1601
1602         pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
1603 }
1604
1605 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
1606         .set = pnv_ioda2_tce_build,
1607 #ifdef CONFIG_IOMMU_API
1608         .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
1609         .tce_kill = pnv_pci_ioda2_tce_invalidate,
1610         .useraddrptr = pnv_tce_useraddrptr,
1611 #endif
1612         .clear = pnv_ioda2_tce_free,
1613         .get = pnv_tce_get,
1614         .free = pnv_pci_ioda2_table_free_pages,
1615 };
1616
1617 static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
1618 {
1619         unsigned int *weight = (unsigned int *)data;
1620
1621         /* This is quite simplistic. The "base" weight of a device
1622          * is 10. 0 means no DMA is to be accounted for it.
1623          */
1624         if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1625                 return 0;
1626
1627         if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
1628             dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
1629             dev->class == PCI_CLASS_SERIAL_USB_EHCI)
1630                 *weight += 3;
1631         else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
1632                 *weight += 15;
1633         else
1634                 *weight += 10;
1635
1636         return 0;
1637 }
1638
1639 static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
1640 {
1641         unsigned int weight = 0;
1642
1643         /* SRIOV VF has same DMA32 weight as its PF */
1644 #ifdef CONFIG_PCI_IOV
1645         if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
1646                 pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
1647                 return weight;
1648         }
1649 #endif
1650
1651         if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
1652                 pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
1653         } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
1654                 struct pci_dev *pdev;
1655
1656                 list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
1657                         pnv_pci_ioda_dev_dma_weight(pdev, &weight);
1658         } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
1659                 pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
1660         }
1661
1662         return weight;
1663 }
1664
1665 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
1666                                        struct pnv_ioda_pe *pe)
1667 {
1669         struct page *tce_mem = NULL;
1670         struct iommu_table *tbl;
1671         unsigned int weight, total_weight = 0;
1672         unsigned int tce32_segsz, base, segs, avail, i;
1673         int64_t rc;
1674         void *addr;
1675
1676         /* XXX FIXME: Handle 64-bit only DMA devices */
1677         /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
1678         /* XXX FIXME: Allocate multi-level tables on PHB3 */
1679         weight = pnv_pci_ioda_pe_dma_weight(pe);
1680         if (!weight)
1681                 return;
1682
1683         pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
1684                      &total_weight);
1685         segs = (weight * phb->ioda.dma32_count) / total_weight;
1686         if (!segs)
1687                 segs = 1;
1688
1689         /*
1690          * Allocate contiguous DMA32 segments. We begin with the expected
1691          * number of segments. On each failed attempt, the number of DMA32
1692          * segments to allocate is decreased by one, until allocation
1693          * succeeds or we are down to a single segment.
1694          */
1695         do {
1696                 for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
1697                         for (avail = 0, i = base; i < base + segs; i++) {
1698                                 if (phb->ioda.dma32_segmap[i] ==
1699                                     IODA_INVALID_PE)
1700                                         avail++;
1701                         }
1702
1703                         if (avail == segs)
1704                                 goto found;
1705                 }
1706         } while (--segs);
1707
1708         if (!segs) {
1709                 pe_warn(pe, "No available DMA32 segments\n");
1710                 return;
1711         }
1712
1713 found:
1714         tbl = pnv_pci_table_alloc(phb->hose->node);
1715         if (WARN_ON(!tbl))
1716                 return;
1717
1718         iommu_register_group(&pe->table_group, phb->hose->global_number,
1719                         pe->pe_number);
1720         pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
1721
1722         /* Grab a 32-bit TCE table */
1723         pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
1724                 weight, total_weight, base, segs);
1725         pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
1726                 base * PNV_IODA1_DMA32_SEGSIZE,
1727                 (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
1728
1729         /* XXX Currently, we allocate one big contiguous table for the
1730          * TCEs. We only really need one chunk per 256M of TCE space
1731          * (i.e. per segment) but that's an optimization for later; it
1732          * requires some added smarts with our get/put_tce implementation.
1733          *
1734          * Each TCE page is 4KB in size and each TCE entry occupies 8
1735          * bytes
1736          */
1737         tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
1738         tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
1739                                    get_order(tce32_segsz * segs));
1740         if (!tce_mem) {
1741                 pe_err(pe, " Failed to allocate 32-bit TCE memory\n");
1742                 goto fail;
1743         }
1744         addr = page_address(tce_mem);
1745         memset(addr, 0, tce32_segsz * segs);
1746
1747         /* Configure HW */
1748         for (i = 0; i < segs; i++) {
1749                 rc = opal_pci_map_pe_dma_window(phb->opal_id,
1750                                               pe->pe_number,
1751                                               base + i, 1,
1752                                               __pa(addr) + tce32_segsz * i,
1753                                               tce32_segsz, IOMMU_PAGE_SIZE_4K);
1754                 if (rc) {
1755                         pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
1756                                rc);
1757                         goto fail;
1758                 }
1759         }
1760
1761         /* Setup DMA32 segment mapping */
1762         for (i = base; i < base + segs; i++)
1763                 phb->ioda.dma32_segmap[i] = pe->pe_number;
1764
1765         /* Setup linux iommu table */
1766         pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
1767                                   base * PNV_IODA1_DMA32_SEGSIZE,
1768                                   IOMMU_PAGE_SHIFT_4K);
1769
1770         tbl->it_ops = &pnv_ioda1_iommu_ops;
1771         pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
1772         pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
1773         iommu_init_table(tbl, phb->hose->node, 0, 0);
1774
1775         pe->dma_setup_done = true;
1776         return;
1777  fail:
1778         /* XXX Failure: Try to fallback to 64-bit only ? */
1779         if (tce_mem)
1780                 __free_pages(tce_mem, get_order(tce32_segsz * segs));
1781         if (tbl) {
1782                 pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
1783                 iommu_tce_table_put(tbl);
1784         }
1785 }
1786
1787 static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
1788                 int num, struct iommu_table *tbl)
1789 {
1790         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1791                         table_group);
1792         struct pnv_phb *phb = pe->phb;
1793         int64_t rc;
1794         const unsigned long size = tbl->it_indirect_levels ?
1795                         tbl->it_level_size : tbl->it_size;
1796         const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
1797         const __u64 win_size = tbl->it_size << tbl->it_page_shift;
1798
1799         pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
1800                 num, start_addr, start_addr + win_size - 1,
1801                 IOMMU_PAGE_SIZE(tbl));
1802
1803         /*
1804          * Map TCE table through TVT. The TVE index is the PE number
1805          * shifted by 1 bit for 32-bit DMA space.
1806          */
1807         rc = opal_pci_map_pe_dma_window(phb->opal_id,
1808                         pe->pe_number,
1809                         (pe->pe_number << 1) + num,
1810                         tbl->it_indirect_levels + 1,
1811                         __pa(tbl->it_base),
1812                         size << 3,
1813                         IOMMU_PAGE_SIZE(tbl));
1814         if (rc) {
1815                 pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
1816                 return rc;
1817         }
1818
1819         pnv_pci_link_table_and_group(phb->hose->node, num,
1820                         tbl, &pe->table_group);
1821         pnv_pci_ioda2_tce_invalidate_pe(pe);
1822
1823         return 0;
1824 }
1825
1826 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
1827 {
1828         uint16_t window_id = (pe->pe_number << 1) + 1;
1829         int64_t rc;
1830
1831         pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
1832         if (enable) {
1833                 phys_addr_t top = memblock_end_of_DRAM();
1834
1835                 top = roundup_pow_of_two(top);
1836                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1837                                                      pe->pe_number,
1838                                                      window_id,
1839                                                      pe->tce_bypass_base,
1840                                                      top);
1841         } else {
1842                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
1843                                                      pe->pe_number,
1844                                                      window_id,
1845                                                      pe->tce_bypass_base,
1846                                                      0);
1847         }
1848         if (rc)
1849                 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
1850         else
1851                 pe->tce_bypass_enabled = enable;
1852 }
1853
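/*
 * Allocate and populate an iommu_table for DMA window @num. Window 0
 * starts at the 32-bit window base; any other window starts at the
 * bypass base (PCI address bit 59).
 */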
1854 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
1855                 int num, __u32 page_shift, __u64 window_size, __u32 levels,
1856                 bool alloc_userspace_copy, struct iommu_table **ptbl)
1857 {
1858         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1859                         table_group);
1860         int nid = pe->phb->hose->node;
1861         __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
1862         long ret;
1863         struct iommu_table *tbl;
1864
1865         tbl = pnv_pci_table_alloc(nid);
1866         if (!tbl)
1867                 return -ENOMEM;
1868
1869         tbl->it_ops = &pnv_ioda2_iommu_ops;
1870
1871         ret = pnv_pci_ioda2_table_alloc_pages(nid,
1872                         bus_offset, page_shift, window_size,
1873                         levels, alloc_userspace_copy, tbl);
1874         if (ret) {
1875                 iommu_tce_table_put(tbl);
1876                 return ret;
1877         }
1878
1879         *ptbl = tbl;
1880
1881         return 0;
1882 }
1883
1884 static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
1885 {
1886         struct iommu_table *tbl = NULL;
1887         long rc;
1888         unsigned long res_start, res_end;
1889
1890         /*
1891          * crashkernel= specifies the kdump kernel's maximum memory at
1892          * some offset and there is no guarantee the result is a power
1893          * of 2, which will cause errors later.
1894          */
1895         const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
1896
1897         /*
1898          * In memory constrained environments, e.g. kdump kernel, the
1899          * DMA window can be larger than available memory, which will
1900          * cause errors later.
1901          */
1902         const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
1903
1904         /*
1905          * We create the default window as big as we can. The constraint is
1906          * the max order of allocation possible. The TCE table is likely to
1907          * end up being multilevel and with on-demand allocation in place,
1908          * the initial use is not going to be huge as the default window aims
1909          * to support crippled devices (i.e. not fully 64-bit DMA capable) only.
1910          */
1911         /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
1912         const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
1913         /* Each TCE level cannot exceed maxblock so go multilevel if needed */
1914         unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
1915         unsigned long tcelevel_order = ilog2(maxblock >> 3);
1916         unsigned int levels = tces_order / tcelevel_order;
1917
1918         if (tces_order % tcelevel_order)
1919                 levels += 1;
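        /*
         * Hypothetical worked example: with 64K pages (PAGE_SHIFT = 16)
         * and MAX_ORDER = 11, maxblock is 64MB, so one level holds
         * 2^23 TCEs (tcelevel_order = 23). A 2TB window needs 2^25 TCEs
         * (tces_order = 25); 25 / 23 = 1 with a remainder, so two
         * levels are needed.
         */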
1920         /*
1921          * We try to stick to default levels (which is >1 at the moment) in
1922          * order to save memory by relying on on-demand TCE level allocation.
1923          */
1924         levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
1925
1926         rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
1927                         window_size, levels, false, &tbl);
1928         if (rc) {
1929                 pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
1930                                 rc);
1931                 return rc;
1932         }
1933
1934         /* We use the top part of the 32-bit space for MMIO, so exclude it from DMA */
1935         res_start = 0;
1936         res_end = 0;
1937         if (window_size > pe->phb->ioda.m32_pci_base) {
1938                 res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
1939                 res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
1940         }
1941         iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
1942
1943         rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
1944         if (rc) {
1945                 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
1946                                 rc);
1947                 iommu_tce_table_put(tbl);
1948                 return rc;
1949         }
1950
1951         if (!pnv_iommu_bypass_disabled)
1952                 pnv_pci_ioda2_set_bypass(pe, true);
1953
1954         /*
1955          * Set table base for the case of IOMMU DMA use. Usually this is done
1956          * from dma_dev_setup() which is not called when a device is returned
1957          * from VFIO so do it here.
1958          */
1959         if (pe->pdev)
1960                 set_iommu_table_base(&pe->pdev->dev, tbl);
1961
1962         return 0;
1963 }
1964
1965 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1966                 int num)
1967 {
1968         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
1969                         table_group);
1970         struct pnv_phb *phb = pe->phb;
1971         long ret;
1972
1973         pe_info(pe, "Removing DMA window #%d\n", num);
1974
1975         ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
1976                         (pe->pe_number << 1) + num,
1977                         0/* levels */, 0/* table address */,
1978                         0/* table size */, 0/* page size */);
1979         if (ret)
1980                 pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
1981         else
1982                 pnv_pci_ioda2_tce_invalidate_pe(pe);
1983
1984         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
1985
1986         return ret;
1987 }
1988
1989 #ifdef CONFIG_IOMMU_API
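/*
 * Worst-case memory needed for a multilevel TCE table of the given
 * geometry: the sum over all levels, doubled to account for the
 * userspace-view copy kept alongside the hardware table.
 */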
1990 unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
1991                 __u64 window_size, __u32 levels)
1992 {
1993         unsigned long bytes = 0;
1994         const unsigned window_shift = ilog2(window_size);
1995         unsigned entries_shift = window_shift - page_shift;
1996         unsigned table_shift = entries_shift + 3;
1997         unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
1998         unsigned long direct_table_size;
1999
2000         if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2001                         !is_power_of_2(window_size))
2002                 return 0;
2003
2004         /* Calculate a direct table size from window_size and levels */
2005         entries_shift = (entries_shift + levels - 1) / levels;
2006         table_shift = entries_shift + 3;
2007         table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2008         direct_table_size = 1UL << table_shift;
2009
2010         for ( ; levels; --levels) {
2011                 bytes += ALIGN(tce_table_size, direct_table_size);
2012
2013                 tce_table_size /= direct_table_size;
2014                 tce_table_size <<= 3;
2015                 tce_table_size = max_t(unsigned long,
2016                                 tce_table_size, direct_table_size);
2017         }
2018
2019         return bytes + bytes; /* one for HW table, one for userspace copy */
2020 }
2021
2022 static long pnv_pci_ioda2_create_table_userspace(
2023                 struct iommu_table_group *table_group,
2024                 int num, __u32 page_shift, __u64 window_size, __u32 levels,
2025                 struct iommu_table **ptbl)
2026 {
2027         long ret = pnv_pci_ioda2_create_table(table_group,
2028                         num, page_shift, window_size, levels, true, ptbl);
2029
2030         if (!ret)
2031                 (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2032                                 page_shift, window_size, levels);
2033         return ret;
2034 }
2035
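/*
 * Point every device on @bus (and on subordinate buses for
 * PNV_IODA_PE_BUS_ALL PEs) at the PE's default TCE table and record
 * the offset used when bypass DMA is enabled.
 */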
2036 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
2037 {
2038         struct pci_dev *dev;
2039
2040         list_for_each_entry(dev, &bus->devices, bus_list) {
2041                 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
2042                 dev->dev.archdata.dma_offset = pe->tce_bypass_base;
2043
2044                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2045                         pnv_ioda_setup_bus_dma(pe, dev->subordinate);
2046         }
2047 }
2048
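/*
 * An external user (VFIO) takes ownership: tear down the kernel's
 * default DMA setup (bypass window and 32-bit TCE table) so userspace
 * can create its own windows; release_ownership() below restores the
 * default configuration.
 */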
2049 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2050 {
2051         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2052                                                 table_group);
2053         /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2054         struct iommu_table *tbl = pe->table_group.tables[0];
2055
2056         pnv_pci_ioda2_set_bypass(pe, false);
2057         pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2058         if (pe->pbus)
2059                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2060         else if (pe->pdev)
2061                 set_iommu_table_base(&pe->pdev->dev, NULL);
2062         iommu_tce_table_put(tbl);
2063 }
2064
2065 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2066 {
2067         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2068                                                 table_group);
2069
2070         pnv_pci_ioda2_setup_default_config(pe);
2071         if (pe->pbus)
2072                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2073 }
2074
2075 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2076         .get_table_size = pnv_pci_ioda2_get_table_size,
2077         .create_table = pnv_pci_ioda2_create_table_userspace,
2078         .set_window = pnv_pci_ioda2_set_window,
2079         .unset_window = pnv_pci_ioda2_unset_window,
2080         .take_ownership = pnv_ioda2_take_ownership,
2081         .release_ownership = pnv_ioda2_release_ownership,
2082 };
2083 #endif
2084
2085 void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2086                                 struct pnv_ioda_pe *pe)
2087 {
2088         int64_t rc;
2089
2090         /* TVE #1 is selected by PCI address bit 59 */
2091         pe->tce_bypass_base = 1ull << 59;
2092
2093         /* The PE will reserve all possible 32-bit space */
2094         pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2095                 phb->ioda.m32_pci_base);
2096
2097         /* Setup linux iommu table */
2098         pe->table_group.tce32_start = 0;
2099         pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2100         pe->table_group.max_dynamic_windows_supported =
2101                         IOMMU_TABLE_GROUP_MAX_TABLES;
2102         pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2103         pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2104
2105         rc = pnv_pci_ioda2_setup_default_config(pe);
2106         if (rc)
2107                 return;
2108
2109 #ifdef CONFIG_IOMMU_API
2110         pe->table_group.ops = &pnv_pci_ioda2_ops;
2111         iommu_register_group(&pe->table_group, phb->hose->global_number,
2112                              pe->pe_number);
2113 #endif
2114         pe->dma_setup_done = true;
2115 }
2116
2117 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
2118 {
2119         struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2120                                            ioda.irq_chip);
2121
2122         return opal_pci_msi_eoi(phb->opal_id, hw_irq);
2123 }
2124
2125 static void pnv_ioda2_msi_eoi(struct irq_data *d)
2126 {
2127         int64_t rc;
2128         unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2129         struct irq_chip *chip = irq_data_get_irq_chip(d);
2130
2131         rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
2132         WARN_ON_ONCE(rc);
2133
2134         icp_native_eoi(d);
2135 }
2136
2138 void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2139 {
2140         struct irq_data *idata;
2141         struct irq_chip *ichip;
2142
2143         /* The MSI EOI OPAL call is only needed on PHB3 */
2144         if (phb->model != PNV_PHB_MODEL_PHB3)
2145                 return;
2146
2147         if (!phb->ioda.irq_chip_init) {
2148                 /*
2149                  * The first time we set up an MSI IRQ, we need to set up
2150                  * the corresponding IRQ chip so that it routes correctly.
2151                  */
2152                 idata = irq_get_irq_data(virq);
2153                 ichip = irq_data_get_irq_chip(idata);
2154                 phb->ioda.irq_chip_init = 1;
2155                 phb->ioda.irq_chip = *ichip;
2156                 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2157         }
2158         irq_set_chip(virq, &phb->ioda.irq_chip);
2159 }
2160
2161 /*
2162  * Returns true iff chip is something that we could call
2163  * pnv_opal_pci_msi_eoi for.
2164  */
2165 bool is_pnv_opal_msi(struct irq_chip *chip)
2166 {
2167         return chip->irq_eoi == pnv_ioda2_msi_eoi;
2168 }
2169 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
2170
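/*
 * Set up one MSI: bind the XIVE (hwirq relative to the PHB's MSI base)
 * to the device's PE, then query OPAL for the MSI address and data to
 * program into the device.
 */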
2171 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2172                                   unsigned int hwirq, unsigned int virq,
2173                                   unsigned int is_64, struct msi_msg *msg)
2174 {
2175         struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2176         unsigned int xive_num = hwirq - phb->msi_base;
2177         __be32 data;
2178         int rc;
2179
2180         /* No PE assigned ? bail out ... no MSI for you ! */
2181         if (pe == NULL)
2182                 return -ENXIO;
2183
2184         /* Check if we have an MVE */
2185         if (pe->mve_number < 0)
2186                 return -ENXIO;
2187
2188         /* Force 32-bit MSI on some broken devices */
2189         if (dev->no_64bit_msi)
2190                 is_64 = 0;
2191
2192         /* Assign XIVE to PE */
2193         rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2194         if (rc) {
2195                 pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2196                         pci_name(dev), rc, xive_num);
2197                 return -EIO;
2198         }
2199
2200         if (is_64) {
2201                 __be64 addr64;
2202
2203                 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2204                                      &addr64, &data);
2205                 if (rc) {
2206                         pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2207                                 pci_name(dev), rc);
2208                         return -EIO;
2209                 }
2210                 msg->address_hi = be64_to_cpu(addr64) >> 32;
2211                 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2212         } else {
2213                 __be32 addr32;
2214
2215                 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2216                                      &addr32, &data);
2217                 if (rc) {
2218                         pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2219                                 pci_name(dev), rc);
2220                         return -EIO;
2221                 }
2222                 msg->address_hi = 0;
2223                 msg->address_lo = be32_to_cpu(addr32);
2224         }
2225         msg->data = be32_to_cpu(data);
2226
2227         pnv_set_msi_irq_chip(phb, virq);
2228
2229         pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2230                  " address=%x_%08x data=%x PE# %x\n",
2231                  pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2232                  msg->address_hi, msg->address_lo, data, pe->pe_number);
2233
2234         return 0;
2235 }
2236
2237 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2238 {
2239         unsigned int count;
2240         const __be32 *prop = of_get_property(phb->hose->dn,
2241                                              "ibm,opal-msi-ranges", NULL);
2242         if (!prop) {
2243                 /* BML Fallback */
2244                 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2245         }
2246         if (!prop)
2247                 return;
2248
2249         phb->msi_base = be32_to_cpup(prop);
2250         count = be32_to_cpup(prop + 1);
2251         if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2252                 pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2253                        phb->hose->global_number);
2254                 return;
2255         }
2256
2257         phb->msi_setup = pnv_pci_ioda_msi_setup;
2258         phb->msi32_support = 1;
2259         pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2260                 count, phb->msi_base);
2261 }
2262
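/*
 * Map every I/O or M32 segment that @res overlaps to @pe via OPAL.
 * M64 resources are handled separately and skipped here.
 */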
2263 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
2264                                   struct resource *res)
2265 {
2266         struct pnv_phb *phb = pe->phb;
2267         struct pci_bus_region region;
2268         int index;
2269         int64_t rc;
2270
2271         if (!res || !res->flags || res->start > res->end)
2272                 return;
2273
2274         if (res->flags & IORESOURCE_IO) {
2275                 region.start = res->start - phb->ioda.io_pci_base;
2276                 region.end   = res->end - phb->ioda.io_pci_base;
2277                 index = region.start / phb->ioda.io_segsize;
2278
2279                 while (index < phb->ioda.total_pe_num &&
2280                        region.start <= region.end) {
2281                         phb->ioda.io_segmap[index] = pe->pe_number;
2282                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2283                                 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
2284                         if (rc != OPAL_SUCCESS) {
2285                                 pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
2286                                        __func__, rc, index, pe->pe_number);
2287                                 break;
2288                         }
2289
2290                         region.start += phb->ioda.io_segsize;
2291                         index++;
2292                 }
2293         } else if ((res->flags & IORESOURCE_MEM) &&
2294                    !pnv_pci_is_m64(phb, res)) {
2295                 region.start = res->start -
2296                                phb->hose->mem_offset[0] -
2297                                phb->ioda.m32_pci_base;
2298                 region.end   = res->end -
2299                                phb->hose->mem_offset[0] -
2300                                phb->ioda.m32_pci_base;
2301                 index = region.start / phb->ioda.m32_segsize;
2302
2303                 while (index < phb->ioda.total_pe_num &&
2304                        region.start <= region.end) {
2305                         phb->ioda.m32_segmap[index] = pe->pe_number;
2306                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2307                                 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
2308                         if (rc != OPAL_SUCCESS) {
2309                                 pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
2310                                        __func__, rc, index, pe->pe_number);
2311                                 break;
2312                         }
2313
2314                         region.start += phb->ioda.m32_segsize;
2315                         index++;
2316                 }
2317         }
2318 }
2319
2320 /*
2321  * This function is supposed to be called on a per-PE basis, from top
2322  * to bottom, so the I/O or MMIO segment assigned to a parent PE can
2323  * be overridden by its child PEs if necessary.
2324  */
2325 static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
2326 {
2327         struct pci_dev *pdev;
2328         int i;
2329
2330         /*
2331          * NOTE: We only care about PCI-bus-based PEs for now. PCI
2332          * device based PEs, for example SR-IOV VFs, should be
2333          * figured out later.
2334          */
2335         BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
2336
2337         list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
2338                 for (i = 0; i <= PCI_ROM_RESOURCE; i++)
2339                         pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
2340
2341                 /*
2342                  * If the PE contains all subordinate PCI buses, the
2343                  * windows of the child bridges should be mapped to
2344                  * the PE as well.
2345                  */
2346                 if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
2347                         continue;
2348                 for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
2349                         pnv_ioda_setup_pe_res(pe,
2350                                 &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
2351         }
2352 }
2353
2354 #ifdef CONFIG_DEBUG_FS
2355 static int pnv_pci_diag_data_set(void *data, u64 val)
2356 {
2357         struct pnv_phb *phb = data;
2358         s64 ret;
2359
2360         /* Retrieve the diag data from firmware */
2361         ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
2362                                           phb->diag_data_size);
2363         if (ret != OPAL_SUCCESS)
2364                 return -EIO;
2365
2366         /* Print the diag data to the kernel log */
2367         pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
2368         return 0;
2369 }
2370
2371 DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
2372                          "%llu\n");
2373
2374 static int pnv_pci_ioda_pe_dump(void *data, u64 val)
2375 {
2376         struct pnv_phb *phb = data;
2377         int pe_num;
2378
2379         for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
2380                 struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
2381
2382                 if (!test_bit(pe_num, phb->ioda.pe_alloc))
2383                         continue;
2384
2385                 pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
2386                         pe->rid, pe->device_count,
2387                         (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
2388                         (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
2389                         (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
2390                         (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
2391                         (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
2392                         (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
2393         }
2394
2395         return 0;
2396 }
2397
2398 DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
2399                          pnv_pci_ioda_pe_dump, "%llu\n");
2400
2401 #endif /* CONFIG_DEBUG_FS */
2402
2403 static void pnv_pci_ioda_create_dbgfs(void)
2404 {
2405 #ifdef CONFIG_DEBUG_FS
2406         struct pci_controller *hose, *tmp;
2407         struct pnv_phb *phb;
2408         char name[16];
2409
2410         list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
2411                 phb = hose->private_data;
2412
2413                 /* Notify initialization of PHB done */
2414                 phb->initialized = 1;
2415
2416                 sprintf(name, "PCI%04x", hose->global_number);
2417                 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
2418
2419                 debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
2420                                            phb, &pnv_pci_diag_data_fops);
2421                 debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
2422                                            phb, &pnv_pci_ioda_pe_dump_fops);
2423         }
2424 #endif /* CONFIG_DEBUG_FS */
2425 }
2426
2427 static void pnv_pci_enable_bridge(struct pci_bus *bus)
2428 {
2429         struct pci_dev *dev = bus->self;
2430         struct pci_bus *child;
2431
2432         /* Empty bus ? bail */
2433         if (list_empty(&bus->devices))
2434                 return;
2435
2436         /*
2437          * If there's a bridge associated with that bus enable it. This works
2438          * around races in the generic code if the enabling is done during
2439          * parallel probing. This can be removed once those races have been
2440          * fixed.
2441          */
2442         if (dev) {
2443                 int rc = pci_enable_device(dev);
2444                 if (rc)
2445                         pci_err(dev, "Error enabling bridge (%d)\n", rc);
2446                 pci_set_master(dev);
2447         }
2448
2449         /* Perform the same to child busses */
2450         list_for_each_entry(child, &bus->children, node)
2451                 pnv_pci_enable_bridge(child);
2452 }
2453
2454 static void pnv_pci_enable_bridges(void)
2455 {
2456         struct pci_controller *hose;
2457
2458         list_for_each_entry(hose, &hose_list, list_node)
2459                 pnv_pci_enable_bridge(hose->bus);
2460 }
2461
2462 static void pnv_pci_ioda_fixup(void)
2463 {
2464         pnv_pci_ioda_setup_nvlink();
2465         pnv_pci_ioda_create_dbgfs();
2466
2467         pnv_pci_enable_bridges();
2468
2469 #ifdef CONFIG_EEH
2470         pnv_eeh_post_init();
2471 #endif
2472 }
2473
2474 /*
2475  * Returns the alignment for I/O or memory windows for P2P
2476  * bridges. That actually depends on how PEs are segmented.
2477  * For now, we return the I/O or M32 segment size for PE-sensitive
2478  * P2P bridges. Otherwise, the default values (4KiB for I/O,
2479  * 1MiB for memory) will be returned.
2480  *
2481  * The current PCI bus might be put into one PE, which was
2482  * created against the parent PCI bridge. In that case, we
2483  * don't need to enlarge the alignment, which saves some
2484  * resources.
2485  */
2486 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
2487                                                 unsigned long type)
2488 {
2489         struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
2490         int num_pci_bridges = 0;
2491         struct pci_dev *bridge;
2492
2493         bridge = bus->self;
2494         while (bridge) {
2495                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
2496                         num_pci_bridges++;
2497                         if (num_pci_bridges >= 2)
2498                                 return 1;
2499                 }
2500
2501                 bridge = bridge->bus->self;
2502         }
2503
2504         /*
2505          * We fall back to M32 if M64 isn't supported. We enforce the M64
2506          * alignment for any 64-bit resource, PCIe doesn't care and
2507          * bridges only do 64-bit prefetchable anyway.
2508          */
2509         if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
2510                 return phb->ioda.m64_segsize;
2511         if (type & IORESOURCE_MEM)
2512                 return phb->ioda.m32_segsize;
2513
2514         return phb->ioda.io_segsize;
2515 }
2516
2517 /*
2518  * We update the root port or the upstream port of the bridge
2519  * behind the root port with the PHB's windows in order to
2520  * accommodate changes in the resources required during PCI
2521  * (slot) hotplug, where the slot is connected to either the
2522  * root port or the downstream ports of a PCIe switch behind
2523  * the root port.
2524  */
2525 static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
2526                                            unsigned long type)
2527 {
2528         struct pci_controller *hose = pci_bus_to_host(bus);
2529         struct pnv_phb *phb = hose->private_data;
2530         struct pci_dev *bridge = bus->self;
2531         struct resource *r, *w;
2532         bool msi_region = false;
2533         int i;
2534
2535         /* Check if we need to apply a fixup to the bridge's windows */
2536         if (!pci_is_root_bus(bridge->bus) &&
2537             !pci_is_root_bus(bridge->bus->self->bus))
2538                 return;
2539
2540         /* Fixup the resources */
2541         for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
2542                 r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
2543                 if (!r->flags || !r->parent)
2544                         continue;
2545
2546                 w = NULL;
2547                 if (r->flags & type & IORESOURCE_IO)
2548                         w = &hose->io_resource;
2549                 else if (pnv_pci_is_m64(phb, r) &&
2550                          (type & IORESOURCE_PREFETCH) &&
2551                          phb->ioda.m64_segsize)
2552                         w = &hose->mem_resources[1];
2553                 else if (r->flags & type & IORESOURCE_MEM) {
2554                         w = &hose->mem_resources[0];
2555                         msi_region = true;
2556                 }
2557
2558                 r->start = w->start;
2559                 r->end = w->end;
2560
2561                 /* The 64KB 32-bit MSI region shouldn't be included in
2562                  * the 32-bit bridge window. Otherwise, we can see strange
2563                  * issues such as the EEH error observed on Garrison.
2564                  *
2565                  * Exclude the top 1MB region, which is the minimal
2566                  * alignment of the 32-bit bridge window.
2567                  */
2568                 if (msi_region) {
2569                         r->end += 0x10000;
2570                         r->end -= 0x100000;
2571                 }
2572         }
2573 }
2574
2575 static void pnv_pci_configure_bus(struct pci_bus *bus)
2576 {
2577         struct pci_dev *bridge = bus->self;
2578         struct pnv_ioda_pe *pe;
2579         bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
2580
2581         dev_info(&bus->dev, "Configuring PE for bus\n");
2582
2583         /* Don't assign a PE to a PCI bus that has no subordinate devices */
2584         if (WARN_ON(list_empty(&bus->devices)))
2585                 return;
2586
2587         /* Reserve PEs according to used M64 resources */
2588         pnv_ioda_reserve_m64_pe(bus, NULL, all);
2589
2590         /*
2591          * Assign PE. We might run here because of partial hotplug.
2592          * In that case, we just pick up the existing PE and should
2593          * not allocate resources again.
2594          */
2595         pe = pnv_ioda_setup_bus_PE(bus, all);
2596         if (!pe)
2597                 return;
2598
2599         pnv_ioda_setup_pe_seg(pe);
2600 }
2601
2602 static resource_size_t pnv_pci_default_alignment(void)
2603 {
2604         return PAGE_SIZE;
2605 }
2606
2607 /* Prevent enabling devices for which we couldn't properly
2608  * assign a PE
2609  */
2610 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
2611 {
2612         struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
2613         struct pci_dn *pdn;
2614
2615         /* The function is probably called while the PEs have
2616          * not been created yet, for example during resource
2617          * reassignment in the PCI probe period. We just skip the
2618          * check if the PEs aren't ready.
2619          */
2620         if (!phb->initialized)
2621                 return true;
2622
2623         pdn = pci_get_pdn(dev);
2624         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2625                 return false;
2626
2627         return true;
2628 }
2629
2630 static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
2631 {
2632         struct pci_controller *hose = pci_bus_to_host(dev->bus);
2633         struct pnv_phb *phb = hose->private_data;
2634         struct pci_dn *pdn;
2635         struct pnv_ioda_pe *pe;
2636
2637         if (!phb->initialized)
2638                 return true;
2639
2640         pdn = pci_get_pdn(dev);
2641         if (!pdn)
2642                 return false;
2643
2644         if (pdn->pe_number == IODA_INVALID_PE) {
2645                 pe = pnv_ioda_setup_dev_PE(dev);
2646                 if (!pe)
2647                         return false;
2648         }
2649         return true;
2650 }
2651
2652 static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
2653                                        int num)
2654 {
2655         struct pnv_ioda_pe *pe = container_of(table_group,
2656                                               struct pnv_ioda_pe, table_group);
2657         struct pnv_phb *phb = pe->phb;
2658         unsigned int idx;
2659         long rc;
2660
2661         pe_info(pe, "Removing DMA window #%d\n", num);
2662         for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
2663                 if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
2664                         continue;
2665
2666                 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2667                                                 idx, 0, 0ul, 0ul, 0ul);
2668                 if (rc != OPAL_SUCCESS) {
2669                         pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
2670                                 rc, idx);
2671                         return rc;
2672                 }
2673
2674                 phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
2675         }
2676
2677         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2678         return OPAL_SUCCESS;
2679 }
2680
2681 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
2682 {
2683         struct iommu_table *tbl = pe->table_group.tables[0];
2684         int64_t rc;
2685
2686         if (!pe->dma_setup_done)
2687                 return;
2688
2689         rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
2690         if (rc != OPAL_SUCCESS)
2691                 return;
2692
2693         pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
2694         if (pe->table_group.group) {
2695                 iommu_group_put(pe->table_group.group);
2696                 WARN_ON(pe->table_group.group);
2697         }
2698
2699         free_pages(tbl->it_base, get_order(tbl->it_size << 3));
2700         iommu_tce_table_put(tbl);
2701 }
2702
2703 void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
2704 {
2705         struct iommu_table *tbl = pe->table_group.tables[0];
2706         int64_t rc;
2707
2708         if (!pe->dma_setup_done)
2709                 return;
2710
2711         rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2712         if (rc)
2713                 pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
2714
2715         pnv_pci_ioda2_set_bypass(pe, false);
2716         if (pe->table_group.group) {
2717                 iommu_group_put(pe->table_group.group);
2718                 WARN_ON(pe->table_group.group);
2719         }
2720
2721         iommu_tce_table_put(tbl);
2722 }
2723
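/*
 * Walk @map and unmap every segment of window type @win that is
 * currently assigned to @pe, handing it back to the reserved PE.
 */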
2724 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
2725                                  unsigned short win,
2726                                  unsigned int *map)
2727 {
2728         struct pnv_phb *phb = pe->phb;
2729         int idx;
2730         int64_t rc;
2731
2732         for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
2733                 if (map[idx] != pe->pe_number)
2734                         continue;
2735
2736                 rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2737                                 phb->ioda.reserved_pe_idx, win, 0, idx);
2738
2739                 if (rc != OPAL_SUCCESS)
2740                         pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
2741                                 rc, win, idx);
2742
2743                 map[idx] = IODA_INVALID_PE;
2744         }
2745 }
2746
2747 static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
2748 {
2749         struct pnv_phb *phb = pe->phb;
2750
2751         if (phb->type == PNV_PHB_IODA1) {
2752                 pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
2753                                      phb->ioda.io_segmap);
2754                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
2755                                      phb->ioda.m32_segmap);
2756                 /* M64 is pre-configured by pnv_ioda1_init_m64() */
2757         } else if (phb->type == PNV_PHB_IODA2) {
2758                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
2759                                      phb->ioda.m32_segmap);
2760         }
2761 }
2762
2763 static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
2764 {
2765         struct pnv_phb *phb = pe->phb;
2766         struct pnv_ioda_pe *slave, *tmp;
2767
2768         pe_info(pe, "Releasing PE\n");
2769
2770         mutex_lock(&phb->ioda.pe_list_mutex);
2771         list_del(&pe->list);
2772         mutex_unlock(&phb->ioda.pe_list_mutex);
2773
2774         switch (phb->type) {
2775         case PNV_PHB_IODA1:
2776                 pnv_pci_ioda1_release_pe_dma(pe);
2777                 break;
2778         case PNV_PHB_IODA2:
2779                 pnv_pci_ioda2_release_pe_dma(pe);
2780                 break;
2781         case PNV_PHB_NPU_OCAPI:
2782                 break;
2783         default:
2784                 WARN_ON(1);
2785         }
2786
2787         pnv_ioda_release_pe_seg(pe);
2788         pnv_ioda_deconfigure_pe(pe->phb, pe);
2789
2790         /* Release slave PEs in the compound PE */
2791         if (pe->flags & PNV_IODA_PE_MASTER) {
2792                 list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
2793                         list_del(&slave->list);
2794                         pnv_ioda_free_pe(slave);
2795                 }
2796         }
2797
2798         /*
2799          * The PE for the root bus can be removed because of hotplug in EEH
2800          * recovery for a fenced PHB error. We need to mark the PE dead so
2801          * that it can be populated again in the PCI hot-add path. The PE
2802          * shouldn't be destroyed as it's a global reserved resource.
2803          */
2804         if (phb->ioda.root_pe_idx == pe->pe_number)
2805                 return;
2806
2807         pnv_ioda_free_pe(pe);
2808 }
2809
2810 static void pnv_pci_release_device(struct pci_dev *pdev)
2811 {
2812         struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
2813         struct pci_dn *pdn = pci_get_pdn(pdev);
2814         struct pnv_ioda_pe *pe;
2815
2816         /* The VF PE state is torn down when sriov_disable() is called */
2817         if (pdev->is_virtfn)
2818                 return;
2819
2820         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
2821                 return;
2822
2823 #ifdef CONFIG_PCI_IOV
2824         /*
2825          * FIXME: Try to move this to sriov_disable(). It's here because we
2826          * allocate the IOV state at probe time, since we need to fiddle
2827          * with the IOV resources.
2828          */
2829         if (pdev->is_physfn)
2830                 kfree(pdev->dev.archdata.iov_data);
2831 #endif
2832
2833         /*
2834          * PCI hotplug can happen as part of EEH error recovery. The @pdn
2835          * isn't removed and added afterwards in this scenario. We should
2836          * set the PE number in @pdn to an invalid one. Otherwise, the PE's
2837          * device count is decreased on removing devices while failing to
2838          * be increased on adding devices. That leads to an unbalanced PE
2839          * device count and eventually breaks the normal PCI hotplug path.
2840          */
2841         pe = &phb->ioda.pe_array[pdn->pe_number];
2842         pdn->pe_number = IODA_INVALID_PE;
2843
2844         WARN_ON(--pe->device_count < 0);
2845         if (pe->device_count == 0)
2846                 pnv_ioda_release_pe(pe);
2847 }
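
/*
 * Illustrative count balancing: a bus-level PE holding two functions
 * sees device_count go 2 -> 1 -> 0 as each one is released; only the
 * final release calls pnv_ioda_release_pe(), and the WARN_ON above
 * flags the count ever going negative from an unbalanced add/remove.
 */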
2848
2849 static void pnv_npu_disable_device(struct pci_dev *pdev)
2850 {
2851         struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
2852         struct eeh_pe *eehpe = edev ? edev->pe : NULL;
2853
2854         if (eehpe && eeh_ops && eeh_ops->reset)
2855                 eeh_ops->reset(eehpe, EEH_RESET_HOT);
2856 }
2857
2858 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
2859 {
2860         struct pnv_phb *phb = hose->private_data;
2861
2862         opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
2863                        OPAL_ASSERT_RESET);
2864 }
2865
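/*
 * The RID cached in a PE encodes the bus number in bits 15:8 and the
 * devfn in bits 7:0 of the PCI requester ID, so (pe->rid >> 8) & 0xFF
 * below recovers the bus number a bus-type PE is attached to.
 */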
2866 static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
2867 {
2868         struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
2869         struct pnv_ioda_pe *pe;
2870
2871         list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2872                 if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
2873                         continue;
2874
2875                 if (!pe->pbus)
2876                         continue;
2877
2878                 if (bus->number == ((pe->rid >> 8) & 0xFF)) {
2879                         pe->pbus = bus;
2880                         break;
2881                 }
2882         }
2883 }
2884
2885 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
2886         .dma_dev_setup          = pnv_pci_ioda_dma_dev_setup,
2887         .dma_bus_setup          = pnv_pci_ioda_dma_bus_setup,
2888         .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
2889         .setup_msi_irqs         = pnv_setup_msi_irqs,
2890         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
2891         .enable_device_hook     = pnv_pci_enable_device_hook,
2892         .release_device         = pnv_pci_release_device,
2893         .window_alignment       = pnv_pci_window_alignment,
2894         .setup_bridge           = pnv_pci_fixup_bridge_resources,
2895         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
2896         .shutdown               = pnv_pci_ioda_shutdown,
2897 };
2898
2899 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
2900         .setup_msi_irqs         = pnv_setup_msi_irqs,
2901         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
2902         .enable_device_hook     = pnv_pci_enable_device_hook,
2903         .window_alignment       = pnv_pci_window_alignment,
2904         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
2905         .shutdown               = pnv_pci_ioda_shutdown,
2906         .disable_device         = pnv_npu_disable_device,
2907 };
2908
2909 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
2910         .enable_device_hook     = pnv_ocapi_enable_device_hook,
2911         .release_device         = pnv_pci_release_device,
2912         .window_alignment       = pnv_pci_window_alignment,
2913         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
2914         .shutdown               = pnv_pci_ioda_shutdown,
2915 };
2916
2917 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
2918                                          u64 hub_id, int ioda_type)
2919 {
2920         struct pci_controller *hose;
2921         struct pnv_phb *phb;
2922         unsigned long size, m64map_off, m32map_off, pemap_off;
2923         unsigned long iomap_off = 0, dma32map_off = 0;
2924         struct pnv_ioda_pe *root_pe;
2925         struct resource r;
2926         const __be64 *prop64;
2927         const __be32 *prop32;
2928         int len;
2929         unsigned int segno;
2930         u64 phb_id;
2931         void *aux;
2932         long rc;
2933
2934         if (!of_device_is_available(np))
2935                 return;
2936
2937         pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
2938
2939         prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
2940         if (!prop64) {
2941                 pr_err("  Missing \"ibm,opal-phbid\" property!\n");
2942                 return;
2943         }
2944         phb_id = be64_to_cpup(prop64);
2945         pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
2946
2947         phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
2948         if (!phb)
2949                 panic("%s: Failed to allocate %zu bytes\n", __func__,
2950                       sizeof(*phb));
2951
2952         /* Allocate PCI controller */
2953         phb->hose = hose = pcibios_alloc_controller(np);
2954         if (!phb->hose) {
2955                 pr_err("  Can't allocate PCI controller for %pOF\n",
2956                        np);
2957                 memblock_free(__pa(phb), sizeof(struct pnv_phb));
2958                 return;
2959         }
2960
2961         spin_lock_init(&phb->lock);
2962         prop32 = of_get_property(np, "bus-range", &len);
2963         if (prop32 && len == 8) {
2964                 hose->first_busno = be32_to_cpu(prop32[0]);
2965                 hose->last_busno = be32_to_cpu(prop32[1]);
2966         } else {
2967                 pr_warn("  Broken <bus-range> on %pOF\n", np);
2968                 hose->first_busno = 0;
2969                 hose->last_busno = 0xff;
2970         }
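        /*
         * An illustrative well-formed property (two u32 cells, len == 8)
         * is:  bus-range = <0x00 0xff>;  covering every possible bus
         * number behind this PHB.
         */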
2971         hose->private_data = phb;
2972         phb->hub_id = hub_id;
2973         phb->opal_id = phb_id;
2974         phb->type = ioda_type;
2975         mutex_init(&phb->ioda.pe_alloc_mutex);
2976
2977         /* Detect specific models for error handling */
2978         if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
2979                 phb->model = PNV_PHB_MODEL_P7IOC;
2980         else if (of_device_is_compatible(np, "ibm,power8-pciex"))
2981                 phb->model = PNV_PHB_MODEL_PHB3;
2982         else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
2983                 phb->model = PNV_PHB_MODEL_NPU;
2984         else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
2985                 phb->model = PNV_PHB_MODEL_NPU2;
2986         else
2987                 phb->model = PNV_PHB_MODEL_UNKNOWN;
2988
2989         /* Initialize diagnostic data buffer */
2990         prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
2991         if (prop32)
2992                 phb->diag_data_size = be32_to_cpup(prop32);
2993         else
2994                 phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
2995
2996         phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
2997         if (!phb->diag_data)
2998                 panic("%s: Failed to allocate %u bytes\n", __func__,
2999                       phb->diag_data_size);
3000
3001         /* Parse 32-bit and IO ranges (if any) */
3002         pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
3003
3004         /* Get registers */
3005         if (!of_address_to_resource(np, 0, &r)) {
3006                 phb->regs_phys = r.start;
3007                 phb->regs = ioremap(r.start, resource_size(&r));
3008                 if (phb->regs == NULL)
3009                         pr_err("  Failed to map registers!\n");
3010         }
3011
3012         /* Initialize more IODA stuff */
3013         phb->ioda.total_pe_num = 1;
3014         prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
3015         if (prop32)
3016                 phb->ioda.total_pe_num = be32_to_cpup(prop32);
3017         prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
3018         if (prop32)
3019                 phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
3020
3021         /* Invalidate RID to PE# mapping */
3022         for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
3023                 phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
3024
3025         /* Parse 64-bit MMIO range */
3026         pnv_ioda_parse_m64_window(phb);
3027
3028         phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
3029         /* FW has already carved the top 64K (MSI space) off the M32 window; add it back */
3030         phb->ioda.m32_size += 0x10000;
3031
3032         phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
3033         phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
3034         phb->ioda.io_size = hose->pci_io_size;
3035         phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
3036         phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
3037
3038         /* Calculate how many 32-bit TCE segments we have */
3039         phb->ioda.dma32_count = phb->ioda.m32_pci_base /
3040                                 PNV_IODA1_DMA32_SEGSIZE;
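        /*
         * Worked example (illustrative values): an m32_pci_base of
         * 0x80000000 (2GB) divided by the 256MB PNV_IODA1_DMA32_SEGSIZE
         * gives 8 DMA32 segments.
         */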
3041
3042         /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3043         size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3044                         sizeof(unsigned long));
3045         m64map_off = size;
3046         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
3047         m32map_off = size;
3048         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
3049         if (phb->type == PNV_PHB_IODA1) {
3050                 iomap_off = size;
3051                 size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
3052                 dma32map_off = size;
3053                 size += phb->ioda.dma32_count *
3054                         sizeof(phb->ioda.dma32_segmap[0]);
3055         }
3056         pemap_off = size;
3057         size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3058         aux = memblock_alloc(size, SMP_CACHE_BYTES);
3059         if (!aux)
3060                 panic("%s: Failed to allocate %lu bytes\n", __func__, size);
3061         phb->ioda.pe_alloc = aux;
3062         phb->ioda.m64_segmap = aux + m64map_off;
3063         phb->ioda.m32_segmap = aux + m32map_off;
3064         for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
3065                 phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
3066                 phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
3067         }
3068         if (phb->type == PNV_PHB_IODA1) {
3069                 phb->ioda.io_segmap = aux + iomap_off;
3070                 for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
3071                         phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
3072
3073                 phb->ioda.dma32_segmap = aux + dma32map_off;
3074                 for (segno = 0; segno < phb->ioda.dma32_count; segno++)
3075                         phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
3076         }
3077         phb->ioda.pe_array = aux + pemap_off;
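
        /*
         * Layout of the single @aux allocation, illustrated for a
         * hypothetical IODA2 PHB with total_pe_num = 256:
         *
         *   0           pe_alloc bitmap: 256/8 = 32 bytes, rounded up
         *               to sizeof(unsigned long)
         *   m64map_off  256-entry m64_segmap
         *   m32map_off  256-entry m32_segmap
         *   pemap_off   256 x struct pnv_ioda_pe
         *
         * An IODA1 PHB additionally slots io_segmap and dma32_segmap in
         * ahead of pemap_off.
         */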
3078
3079         /*
3080          * Choose a PE number for the root bus, which shouldn't have
3081          * M64 resources consumed by its child devices. Pick the PE
3082          * number adjacent to the reserved one if possible.
3083          */
3084         pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
3085         if (phb->ioda.reserved_pe_idx == 0) {
3086                 phb->ioda.root_pe_idx = 1;
3087                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3088         } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
3089                 phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
3090                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3091         } else {
3092                 /* otherwise just allocate one */
3093                 root_pe = pnv_ioda_alloc_pe(phb, 1);
3094                 phb->ioda.root_pe_idx = root_pe->pe_number;
3095         }
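
        /*
         * For example: reserved_pe_idx == 0 makes PE#1 the root PE; with
         * 256 PEs and reserved_pe_idx == 255 the root PE becomes PE#254;
         * any other reserved index falls through to a plain
         * pnv_ioda_alloc_pe() allocation.
         */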
3096
3097         INIT_LIST_HEAD(&phb->ioda.pe_list);
3098         mutex_init(&phb->ioda.pe_list_mutex);
3103
3104 #if 0 /* We should really do this ... (the arguments below are placeholders) */
3105         rc = opal_pci_set_phb_mem_window(opal->phb_id,
3106                                          window_type,
3107                                          window_num,
3108                                          starting_real_address,
3109                                          starting_pci_address,
3110                                          segment_size);
3111 #endif
3112
3113         pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
3114                 phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
3115                 phb->ioda.m32_size, phb->ioda.m32_segsize);
3116         if (phb->ioda.m64_size)
3117                 pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
3118                         phb->ioda.m64_size, phb->ioda.m64_segsize);
3119         if (phb->ioda.io_size)
3120                 pr_info("                  IO: 0x%x [segment=0x%x]\n",
3121                         phb->ioda.io_size, phb->ioda.io_segsize);
3122
3124         phb->hose->ops = &pnv_pci_ops;
3125         phb->get_pe_state = pnv_ioda_get_pe_state;
3126         phb->freeze_pe = pnv_ioda_freeze_pe;
3127         phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
3128
3129         /* Setup MSI support */
3130         pnv_pci_init_ioda_msis(phb);
3131
3132         /*
3133          * We set the PCI probe flag PCI_REASSIGN_ALL_RSRC (below) to
3134          * let the PCI core do the resource assignment itself. The
3135          * expectation is that the core aligns the I/O and MMIO windows
3136          * of P2P bridge BARs such that each PCI bus (excluding the
3137          * child P2P bridges) can form an individual PE.
3138          */
3139         ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
3140
3141         switch (phb->type) {
3142         case PNV_PHB_NPU_NVLINK:
3143                 hose->controller_ops = pnv_npu_ioda_controller_ops;
3144                 break;
3145         case PNV_PHB_NPU_OCAPI:
3146                 hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
3147                 break;
3148         default:
3149                 hose->controller_ops = pnv_pci_ioda_controller_ops;
3150         }
3151
3152         ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
3153
3154 #ifdef CONFIG_PCI_IOV
3155         ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
3156         ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
3157         ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
3158         ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
3159 #endif
3160
3161         pci_add_flags(PCI_REASSIGN_ALL_RSRC);
3162
3163         /* Reset IODA tables to a clean state */
3164         rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
3165         if (rc)
3166                 pr_warn("  OPAL Error %ld performing IODA table reset!\n", rc);
3167
3168         /*
3169          * If we're running in a kdump kernel, the previous kernel never
3170          * shut down its PCI devices correctly, and the IODA table has
3171          * just been cleaned out. So also issue a PHB reset to stop any
3172          * PCI transactions left over from the previous kernel. The
3173          * ppc_pci_reset_phbs kernel parameter forces this reset too.
3174          * Additionally, if the IODA reset above failed (which can happen
3175          * after a PHB fatal error in very early boot), use this bigger
3176          * hammer.
3177          */
3178         if (is_kdump_kernel() || pci_reset_phbs || rc) {
3179                 pr_info("  Issue PHB reset ...\n");
3180                 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
3181                 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
3182         }
3183
3184         /* Remove M64 resource if we can't configure it successfully */
3185         if (!phb->init_m64 || phb->init_m64(phb))
3186                 hose->mem_resources[1].flags = 0;
3187 }
3188
3189 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
3190 {
3191         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
3192 }
3193
3194 void __init pnv_pci_init_npu_phb(struct device_node *np)
3195 {
3196         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
3197 }
3198
3199 void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
3200 {
3201         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
3202 }
3203
3204 static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
3205 {
3206         struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
3207
3208         if (!machine_is(powernv))
3209                 return;
3210
3211         if (phb->type == PNV_PHB_NPU_OCAPI)
3212                 dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
3213 }
3214 DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
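
/*
 * PCI_CFG_SPACE_EXP_SIZE is the 4096-byte PCIe extended config space,
 * versus the 256-byte PCI_CFG_SPACE_SIZE default; the fixup forces the
 * extended size for every device behind an OpenCAPI PHB.
 */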
3215
3216 void __init pnv_pci_init_ioda_hub(struct device_node *np)
3217 {
3218         struct device_node *phbn;
3219         const __be64 *prop64;
3220         u64 hub_id;
3221
3222         pr_info("Probing IODA IO-Hub %pOF\n", np);
3223
3224         prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
3225         if (!prop64) {
3226                 pr_err(" Missing \"ibm,opal-hubid\" property!\n");
3227                 return;
3228         }
3229         hub_id = be64_to_cpup(prop64);
3230         pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
3231
3232         /* Count child PHBs */
3233         for_each_child_of_node(np, phbn) {
3234                 /* Look for IODA1 PHBs */
3235                 if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
3236                         pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
3237         }
3238 }
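
/*
 * Illustrative device-tree shape consumed above (names and values are
 * examples only):
 *
 *   io-hub@... {
 *           ibm,opal-hubid = <...>;
 *           pciex@... {
 *                   compatible = "ibm,ioda-phb";
 *                   ibm,opal-phbid = <...>;
 *           };
 *   };
 */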