
TOMOYO Linux Cross Reference
Linux/arch/powerpc/platforms/powernv/pci-ioda.c


  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Support PCI/PCIe on PowerNV platforms
  4  *
  5  * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
  6  */
  7 
  8 #undef DEBUG
  9 
 10 #include <linux/kernel.h>
 11 #include <linux/pci.h>
 12 #include <linux/crash_dump.h>
 13 #include <linux/delay.h>
 14 #include <linux/string.h>
 15 #include <linux/init.h>
 16 #include <linux/memblock.h>
 17 #include <linux/irq.h>
 18 #include <linux/io.h>
 19 #include <linux/msi.h>
 20 #include <linux/iommu.h>
 21 #include <linux/rculist.h>
 22 #include <linux/sizes.h>
 23 
 24 #include <asm/sections.h>
 25 #include <asm/io.h>
 26 #include <asm/prom.h>
 27 #include <asm/pci-bridge.h>
 28 #include <asm/machdep.h>
 29 #include <asm/msi_bitmap.h>
 30 #include <asm/ppc-pci.h>
 31 #include <asm/opal.h>
 32 #include <asm/iommu.h>
 33 #include <asm/tce.h>
 34 #include <asm/xics.h>
 35 #include <asm/debugfs.h>
 36 #include <asm/firmware.h>
 37 #include <asm/pnv-pci.h>
 38 #include <asm/mmzone.h>
 39 
 40 #include <misc/cxl-base.h>
 41 
 42 #include "powernv.h"
 43 #include "pci.h"
 44 #include "../../../../drivers/pci/pci.h"
 45 
 46 #define PNV_IODA1_M64_NUM       16      /* Number of M64 BARs   */
 47 #define PNV_IODA1_M64_SEGS      8       /* Segments per M64 BAR */
 48 #define PNV_IODA1_DMA32_SEGSIZE 0x10000000
 49 
 50 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
 51                                               "NPU_OCAPI" };
 52 
 53 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 54                             const char *fmt, ...)
 55 {
 56         struct va_format vaf;
 57         va_list args;
 58         char pfix[32];
 59 
 60         va_start(args, fmt);
 61 
 62         vaf.fmt = fmt;
 63         vaf.va = &args;
 64 
 65         if (pe->flags & PNV_IODA_PE_DEV)
 66                 strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
 67         else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 68                 sprintf(pfix, "%04x:%02x     ",
 69                         pci_domain_nr(pe->pbus), pe->pbus->number);
 70 #ifdef CONFIG_PCI_IOV
 71         else if (pe->flags & PNV_IODA_PE_VF)
 72                 sprintf(pfix, "%04x:%02x:%2x.%d",
 73                         pci_domain_nr(pe->parent_dev->bus),
 74                         (pe->rid & 0xff00) >> 8,
 75                         PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
  76 #endif /* CONFIG_PCI_IOV */
 77 
 78         printk("%spci %s: [PE# %.2x] %pV",
 79                level, pfix, pe->pe_number, &vaf);
 80 
 81         va_end(args);
 82 }
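
/*
 * pe_level_printk() is normally reached via the pe_err()/pe_warn()/pe_info()
 * wrappers defined in pci.h rather than being called directly. A typical use,
 * as seen later in this file:
 *
 *	pe_info(pe, "Associated device to PE\n");
 *
 * prints something like "pci 0000:01:00.0: [PE# 02] Associated device to PE",
 * with the prefix chosen above from the PE's device, bus or VF identity.
 */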
 83 
 84 static bool pnv_iommu_bypass_disabled __read_mostly;
 85 static bool pci_reset_phbs __read_mostly;
 86 
 87 static int __init iommu_setup(char *str)
 88 {
 89         if (!str)
 90                 return -EINVAL;
 91 
 92         while (*str) {
 93                 if (!strncmp(str, "nobypass", 8)) {
 94                         pnv_iommu_bypass_disabled = true;
 95                         pr_info("PowerNV: IOMMU bypass window disabled.\n");
 96                         break;
 97                 }
 98                 str += strcspn(str, ",");
 99                 if (*str == ',')
100                         str++;
101         }
102 
103         return 0;
104 }
105 early_param("iommu", iommu_setup);
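
/*
 * For example, booting with "iommu=nobypass" on the kernel command line
 * sets pnv_iommu_bypass_disabled, so the 64-bit bypass (direct-mapped) DMA
 * window is not offered to devices and DMA is serviced through the TCE
 * tables instead.
 */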
106 
107 static int __init pci_reset_phbs_setup(char *str)
108 {
109         pci_reset_phbs = true;
110         return 0;
111 }
112 
113 early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
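
/*
 * For example, booting with "ppc_pci_reset_phbs" sets pci_reset_phbs,
 * which is presumably consumed during PHB initialisation to request a
 * full reset of every PHB, discarding any state left over from a
 * previous kernel or from firmware.
 */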
114 
115 static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
116 {
117         /*
118          * WARNING: We cannot rely on the resource flags. The Linux PCI
119          * allocation code sometimes decides to put a 64-bit prefetchable
120          * BAR in the 32-bit window, so we have to compare the addresses.
121          *
122          * For simplicity we only test resource start.
123          */
124         return (r->start >= phb->ioda.m64_base &&
125                 r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
126 }
127 
128 static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
129 {
130         unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
131 
132         return (resource_flags & flags) == flags;
133 }
134 
135 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
136 {
137         s64 rc;
138 
139         phb->ioda.pe_array[pe_no].phb = phb;
140         phb->ioda.pe_array[pe_no].pe_number = pe_no;
141 
 142         /*
 143          * Clear the PE frozen state, as it might have been put into the
 144          * frozen state by a previous PCI remove path. Doing so is harmless
 145          * if the PE is already unfrozen.
 146          */
147         rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
148                                        OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
149         if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
150                 pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
151                         __func__, rc, phb->hose->global_number, pe_no);
152 
153         return &phb->ioda.pe_array[pe_no];
154 }
155 
156 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
157 {
158         if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
159                 pr_warn("%s: Invalid PE %x on PHB#%x\n",
160                         __func__, pe_no, phb->hose->global_number);
161                 return;
162         }
163 
164         if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
165                 pr_debug("%s: PE %x was reserved on PHB#%x\n",
166                          __func__, pe_no, phb->hose->global_number);
167 
168         pnv_ioda_init_pe(phb, pe_no);
169 }
170 
171 static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
172 {
173         long pe;
174 
175         for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
176                 if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
177                         return pnv_ioda_init_pe(phb, pe);
178         }
179 
180         return NULL;
181 }
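
/*
 * Note the top-down scan: dynamically allocated PEs come from the highest
 * numbers first, presumably so that low PE numbers remain free for M64
 * segments, whose PE numbers are fixed by segment index (see
 * pnv_ioda_pick_m64_pe()).
 */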
182 
183 static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
184 {
185         struct pnv_phb *phb = pe->phb;
186         unsigned int pe_num = pe->pe_number;
187 
188         WARN_ON(pe->pdev);
189         WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */
190         kfree(pe->npucomp);
191         memset(pe, 0, sizeof(struct pnv_ioda_pe));
192         clear_bit(pe_num, phb->ioda.pe_alloc);
193 }
194 
195 /* The default M64 BAR is shared by all PEs */
196 static int pnv_ioda2_init_m64(struct pnv_phb *phb)
197 {
198         const char *desc;
199         struct resource *r;
200         s64 rc;
201 
202         /* Configure the default M64 BAR */
203         rc = opal_pci_set_phb_mem_window(phb->opal_id,
204                                          OPAL_M64_WINDOW_TYPE,
205                                          phb->ioda.m64_bar_idx,
206                                          phb->ioda.m64_base,
207                                          0, /* unused */
208                                          phb->ioda.m64_size);
209         if (rc != OPAL_SUCCESS) {
210                 desc = "configuring";
211                 goto fail;
212         }
213 
214         /* Enable the default M64 BAR */
215         rc = opal_pci_phb_mmio_enable(phb->opal_id,
216                                       OPAL_M64_WINDOW_TYPE,
217                                       phb->ioda.m64_bar_idx,
218                                       OPAL_ENABLE_M64_SPLIT);
219         if (rc != OPAL_SUCCESS) {
220                 desc = "enabling";
221                 goto fail;
222         }
223 
 224         /*
 225          * Exclude the segments for the reserved PE and the root bus PE,
 226          * which are the first two or the last two PEs.
 227          */
228         r = &phb->hose->mem_resources[1];
229         if (phb->ioda.reserved_pe_idx == 0)
230                 r->start += (2 * phb->ioda.m64_segsize);
231         else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
232                 r->end -= (2 * phb->ioda.m64_segsize);
233         else
234                 pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
235                         phb->ioda.reserved_pe_idx);
236 
237         return 0;
238 
239 fail:
240         pr_warn("  Failure %lld %s M64 BAR#%d\n",
241                 rc, desc, phb->ioda.m64_bar_idx);
242         opal_pci_phb_mmio_enable(phb->opal_id,
243                                  OPAL_M64_WINDOW_TYPE,
244                                  phb->ioda.m64_bar_idx,
245                                  OPAL_DISABLE_M64);
246         return -EIO;
247 }
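
/*
 * With OPAL_ENABLE_M64_SPLIT the single default M64 BAR is segmented
 * across the PEs: it is carved into total_pe_num equal pieces of
 * m64_segsize bytes, and on PHB3-style hardware segment N maps to PE#N
 * (see the fixed-mapping comment in pnv_ioda_pick_m64_pe()).
 */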
248 
249 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
250                                          unsigned long *pe_bitmap)
251 {
252         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
253         struct pnv_phb *phb = hose->private_data;
254         struct resource *r;
255         resource_size_t base, sgsz, start, end;
256         int segno, i;
257 
258         base = phb->ioda.m64_base;
259         sgsz = phb->ioda.m64_segsize;
260         for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
261                 r = &pdev->resource[i];
262                 if (!r->parent || !pnv_pci_is_m64(phb, r))
263                         continue;
264 
265                 start = _ALIGN_DOWN(r->start - base, sgsz);
266                 end = _ALIGN_UP(r->end - base, sgsz);
267                 for (segno = start / sgsz; segno < end / sgsz; segno++) {
268                         if (pe_bitmap)
269                                 set_bit(segno, pe_bitmap);
270                         else
271                                 pnv_ioda_reserve_pe(phb, segno);
272                 }
273         }
274 }
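
/*
 * A worked example of the arithmetic above, with made-up numbers: assume
 * base = 0x3fe000000000 and sgsz = 0x10000000 (256MB). A BAR spanning
 * 0x3fe030000000..0x3fe03fffffff yields start = 0x30000000 and
 * end = 0x40000000, so segno only takes the value 3 and PE#3 is the one
 * reserved (or marked in pe_bitmap).
 */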
275 
276 static int pnv_ioda1_init_m64(struct pnv_phb *phb)
277 {
278         struct resource *r;
279         int index;
280 
281         /*
282          * There are 16 M64 BARs, each of which has 8 segments. So
283          * there are as many M64 segments as the maximum number of
284          * PEs, which is 128.
285          */
286         for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
287                 unsigned long base, segsz = phb->ioda.m64_segsize;
288                 int64_t rc;
289 
290                 base = phb->ioda.m64_base +
291                        index * PNV_IODA1_M64_SEGS * segsz;
292                 rc = opal_pci_set_phb_mem_window(phb->opal_id,
293                                 OPAL_M64_WINDOW_TYPE, index, base, 0,
294                                 PNV_IODA1_M64_SEGS * segsz);
295                 if (rc != OPAL_SUCCESS) {
296                         pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
297                                 rc, phb->hose->global_number, index);
298                         goto fail;
299                 }
300 
301                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
302                                 OPAL_M64_WINDOW_TYPE, index,
303                                 OPAL_ENABLE_M64_SPLIT);
304                 if (rc != OPAL_SUCCESS) {
305                         pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
306                                 rc, phb->hose->global_number, index);
307                         goto fail;
308                 }
309         }
310 
 311         /*
 312          * Exclude the segments for the reserved PE and the root bus PE,
 313          * which are the first two or the last two PEs.
 314          */
315         r = &phb->hose->mem_resources[1];
316         if (phb->ioda.reserved_pe_idx == 0)
317                 r->start += (2 * phb->ioda.m64_segsize);
318         else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
319                 r->end -= (2 * phb->ioda.m64_segsize);
320         else
321                 WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
322                      phb->ioda.reserved_pe_idx, phb->hose->global_number);
323 
324         return 0;
325 
326 fail:
327         for ( ; index >= 0; index--)
328                 opal_pci_phb_mmio_enable(phb->opal_id,
329                         OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
330 
331         return -EIO;
332 }
333 
334 static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
335                                     unsigned long *pe_bitmap,
336                                     bool all)
337 {
338         struct pci_dev *pdev;
339 
340         list_for_each_entry(pdev, &bus->devices, bus_list) {
341                 pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap);
342 
343                 if (all && pdev->subordinate)
344                         pnv_ioda_reserve_m64_pe(pdev->subordinate,
345                                                 pe_bitmap, all);
346         }
347 }
348 
349 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
350 {
351         struct pci_controller *hose = pci_bus_to_host(bus);
352         struct pnv_phb *phb = hose->private_data;
353         struct pnv_ioda_pe *master_pe, *pe;
354         unsigned long size, *pe_alloc;
355         int i;
356 
357         /* Root bus shouldn't use M64 */
358         if (pci_is_root_bus(bus))
359                 return NULL;
360 
361         /* Allocate bitmap */
362         size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
363         pe_alloc = kzalloc(size, GFP_KERNEL);
364         if (!pe_alloc) {
 365                 pr_warn("%s: Out of memory!\n",
366                         __func__);
367                 return NULL;
368         }
369 
 370         /* Figure out which PE numbers are reserved for this bus's M64 segments */
371         pnv_ioda_reserve_m64_pe(bus, pe_alloc, all);
372 
 373         /*
 374          * The current bus might not own any M64 window itself; it may
 375          * all be contributed by its child buses. In that case, we don't
 376          * need to pick an M64 dependent PE#.
 377          */
378         if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) {
379                 kfree(pe_alloc);
380                 return NULL;
381         }
382 
383         /*
384          * Figure out the master PE and put all slave PEs to master
385          * PE's list to form compound PE.
386          */
387         master_pe = NULL;
388         i = -1;
389         while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) <
390                 phb->ioda.total_pe_num) {
391                 pe = &phb->ioda.pe_array[i];
392 
393                 phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number;
394                 if (!master_pe) {
395                         pe->flags |= PNV_IODA_PE_MASTER;
396                         INIT_LIST_HEAD(&pe->slaves);
397                         master_pe = pe;
398                 } else {
399                         pe->flags |= PNV_IODA_PE_SLAVE;
400                         pe->master = master_pe;
401                         list_add_tail(&pe->list, &master_pe->slaves);
402                 }
403 
 404                 /*
 405                  * P7IOC supports M64DT, which allows an M64 segment to be
 406                  * mapped to any particular PE#. However, PHB3 has a fixed
 407                  * mapping between M64 segment and PE#. In order to share
 408                  * the same logic between P7IOC and PHB3, we enforce the
 409                  * fixed mapping between M64 segment and PE# on P7IOC too.
 410                  */
411                 if (phb->type == PNV_PHB_IODA1) {
412                         int64_t rc;
413 
414                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
415                                         pe->pe_number, OPAL_M64_WINDOW_TYPE,
416                                         pe->pe_number / PNV_IODA1_M64_SEGS,
417                                         pe->pe_number % PNV_IODA1_M64_SEGS);
418                         if (rc != OPAL_SUCCESS)
419                                 pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
420                                         __func__, rc, phb->hose->global_number,
421                                         pe->pe_number);
422                 }
423         }
424 
425         kfree(pe_alloc);
426         return master_pe;
427 }
428 
429 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
430 {
431         struct pci_controller *hose = phb->hose;
432         struct device_node *dn = hose->dn;
433         struct resource *res;
434         u32 m64_range[2], i;
435         const __be32 *r;
436         u64 pci_addr;
437 
438         if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
 439                 pr_info("  M64 window not supported\n");
440                 return;
441         }
442 
443         if (!firmware_has_feature(FW_FEATURE_OPAL)) {
444                 pr_info("  Firmware too old to support M64 window\n");
445                 return;
446         }
447 
448         r = of_get_property(dn, "ibm,opal-m64-window", NULL);
449         if (!r) {
450                 pr_info("  No <ibm,opal-m64-window> on %pOF\n",
451                         dn);
452                 return;
453         }
454 
 455         /*
 456          * Find the available M64 BAR range and pick the last one to
 457          * cover the whole 64-bit space. We support only one range.
 458          */
459         if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges",
460                                        m64_range, 2)) {
 461                 /* In the absence of the property, assume 0..15 */
462                 m64_range[0] = 0;
463                 m64_range[1] = 16;
464         }
465         /* We only support 64 bits in our allocator */
466         if (m64_range[1] > 63) {
467                 pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n",
468                         __func__, m64_range[1], phb->hose->global_number);
469                 m64_range[1] = 63;
470         }
471         /* Empty range, no m64 */
472         if (m64_range[1] <= m64_range[0]) {
473                 pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n",
474                         __func__, phb->hose->global_number);
475                 return;
476         }
477 
 478         /* Configure M64 information */
479         res = &hose->mem_resources[1];
480         res->name = dn->full_name;
481         res->start = of_translate_address(dn, r + 2);
482         res->end = res->start + of_read_number(r + 4, 2) - 1;
483         res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
484         pci_addr = of_read_number(r, 2);
485         hose->mem_offset[1] = res->start - pci_addr;
486 
487         phb->ioda.m64_size = resource_size(res);
488         phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num;
489         phb->ioda.m64_base = pci_addr;
490 
491         /* This lines up nicely with the display from processing OF ranges */
492         pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n",
493                 res->start, res->end, pci_addr, m64_range[0],
494                 m64_range[0] + m64_range[1] - 1);
495 
 496         /* Mark all M64 BARs as used by default */
497         phb->ioda.m64_bar_alloc = (unsigned long)-1;
498 
499         /* Use last M64 BAR to cover M64 window */
500         m64_range[1]--;
501         phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1];
502 
503         pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx);
504 
505         /* Mark remaining ones free */
506         for (i = m64_range[0]; i < m64_range[1]; i++)
507                 clear_bit(i, &phb->ioda.m64_bar_alloc);
508 
 509         /*
 510          * Set up the M64 init function based on the IODA version; IODA3
 511          * uses the IODA2 code.
 512          */
513         if (phb->type == PNV_PHB_IODA1)
514                 phb->init_m64 = pnv_ioda1_init_m64;
515         else
516                 phb->init_m64 = pnv_ioda2_init_m64;
517 }
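
/*
 * The "ibm,opal-m64-window" property parsed above is six cells wide and is
 * read as three 64-bit values: the window's PCI address (r + 0), its
 * parent-bus address (r + 2, translated to a CPU address), and its size
 * (r + 4). A hypothetical 64GB window at PCI address 0x3fe000000000 might
 * therefore look like:
 *
 *	ibm,opal-m64-window = <0x3fe0 0x0 0x3fe0 0x0 0x10 0x0>;
 */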
518 
519 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
520 {
521         struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
522         struct pnv_ioda_pe *slave;
523         s64 rc;
524 
525         /* Fetch master PE */
526         if (pe->flags & PNV_IODA_PE_SLAVE) {
527                 pe = pe->master;
528                 if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
529                         return;
530 
531                 pe_no = pe->pe_number;
532         }
533 
534         /* Freeze master PE */
535         rc = opal_pci_eeh_freeze_set(phb->opal_id,
536                                      pe_no,
537                                      OPAL_EEH_ACTION_SET_FREEZE_ALL);
538         if (rc != OPAL_SUCCESS) {
539                 pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
540                         __func__, rc, phb->hose->global_number, pe_no);
541                 return;
542         }
543 
544         /* Freeze slave PEs */
545         if (!(pe->flags & PNV_IODA_PE_MASTER))
546                 return;
547 
548         list_for_each_entry(slave, &pe->slaves, list) {
549                 rc = opal_pci_eeh_freeze_set(phb->opal_id,
550                                              slave->pe_number,
551                                              OPAL_EEH_ACTION_SET_FREEZE_ALL);
552                 if (rc != OPAL_SUCCESS)
553                         pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
554                                 __func__, rc, phb->hose->global_number,
555                                 slave->pe_number);
556         }
557 }
558 
559 static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
560 {
561         struct pnv_ioda_pe *pe, *slave;
562         s64 rc;
563 
564         /* Find master PE */
565         pe = &phb->ioda.pe_array[pe_no];
566         if (pe->flags & PNV_IODA_PE_SLAVE) {
567                 pe = pe->master;
568                 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
569                 pe_no = pe->pe_number;
570         }
571 
572         /* Clear frozen state for master PE */
573         rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
574         if (rc != OPAL_SUCCESS) {
575                 pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
576                         __func__, rc, opt, phb->hose->global_number, pe_no);
577                 return -EIO;
578         }
579 
580         if (!(pe->flags & PNV_IODA_PE_MASTER))
581                 return 0;
582 
583         /* Clear frozen state for slave PEs */
584         list_for_each_entry(slave, &pe->slaves, list) {
585                 rc = opal_pci_eeh_freeze_clear(phb->opal_id,
586                                              slave->pe_number,
587                                              opt);
588                 if (rc != OPAL_SUCCESS) {
589                         pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
590                                 __func__, rc, opt, phb->hose->global_number,
591                                 slave->pe_number);
592                         return -EIO;
593                 }
594         }
595 
596         return 0;
597 }
598 
599 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
600 {
601         struct pnv_ioda_pe *slave, *pe;
602         u8 fstate = 0, state;
603         __be16 pcierr = 0;
604         s64 rc;
605 
606         /* Sanity check on PE number */
607         if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num)
608                 return OPAL_EEH_STOPPED_PERM_UNAVAIL;
609 
 610         /*
 611          * Fetch the master PE; note that the PE instance might
 612          * not be initialized yet.
 613          */
614         pe = &phb->ioda.pe_array[pe_no];
615         if (pe->flags & PNV_IODA_PE_SLAVE) {
616                 pe = pe->master;
617                 WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
618                 pe_no = pe->pe_number;
619         }
620 
621         /* Check the master PE */
622         rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
623                                         &state, &pcierr, NULL);
624         if (rc != OPAL_SUCCESS) {
625                 pr_warn("%s: Failure %lld getting "
626                         "PHB#%x-PE#%x state\n",
627                         __func__, rc,
628                         phb->hose->global_number, pe_no);
629                 return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
630         }
631 
 632         /* Check the slave PEs */
633         if (!(pe->flags & PNV_IODA_PE_MASTER))
634                 return state;
635 
636         list_for_each_entry(slave, &pe->slaves, list) {
637                 rc = opal_pci_eeh_freeze_status(phb->opal_id,
638                                                 slave->pe_number,
639                                                 &fstate,
640                                                 &pcierr,
641                                                 NULL);
642                 if (rc != OPAL_SUCCESS) {
643                         pr_warn("%s: Failure %lld getting "
644                                 "PHB#%x-PE#%x state\n",
645                                 __func__, rc,
646                                 phb->hose->global_number, slave->pe_number);
647                         return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
648                 }
649 
 650                 /*
 651                  * Override the result based on ascending
 652                  * priority: a higher freeze state wins.
 653                  */
654                 if (fstate > state)
655                         state = fstate;
656         }
657 
658         return state;
659 }
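
/*
 * For a compound PE the state returned above is the maximum across the
 * master and all of its slaves: the OPAL_EEH_STOPPED_* values ascend in
 * severity, so the most severely stopped constituent determines the result.
 */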
660 
661 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
662 {
663         struct pci_controller *hose = pci_bus_to_host(dev->bus);
664         struct pnv_phb *phb = hose->private_data;
665         struct pci_dn *pdn = pci_get_pdn(dev);
666 
667         if (!pdn)
668                 return NULL;
669         if (pdn->pe_number == IODA_INVALID_PE)
670                 return NULL;
671         return &phb->ioda.pe_array[pdn->pe_number];
672 }
673 
674 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
675                                   struct pnv_ioda_pe *parent,
676                                   struct pnv_ioda_pe *child,
677                                   bool is_add)
678 {
679         const char *desc = is_add ? "adding" : "removing";
680         uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
681                               OPAL_REMOVE_PE_FROM_DOMAIN;
682         struct pnv_ioda_pe *slave;
683         long rc;
684 
685         /* Parent PE affects child PE */
686         rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
687                                 child->pe_number, op);
688         if (rc != OPAL_SUCCESS) {
689                 pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
690                         rc, desc);
691                 return -ENXIO;
692         }
693 
694         if (!(child->flags & PNV_IODA_PE_MASTER))
695                 return 0;
696 
697         /* Compound case: parent PE affects slave PEs */
698         list_for_each_entry(slave, &child->slaves, list) {
699                 rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
700                                         slave->pe_number, op);
701                 if (rc != OPAL_SUCCESS) {
702                         pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
703                                 rc, desc);
704                         return -ENXIO;
705                 }
706         }
707 
708         return 0;
709 }
710 
711 static int pnv_ioda_set_peltv(struct pnv_phb *phb,
712                               struct pnv_ioda_pe *pe,
713                               bool is_add)
714 {
715         struct pnv_ioda_pe *slave;
716         struct pci_dev *pdev = NULL;
717         int ret;
718 
 719         /*
 720          * Clear the PE frozen state. If it's a master PE, we need
 721          * to clear the slave PEs' frozen state as well.
 722          */
723         if (is_add) {
724                 opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
725                                           OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
726                 if (pe->flags & PNV_IODA_PE_MASTER) {
727                         list_for_each_entry(slave, &pe->slaves, list)
728                                 opal_pci_eeh_freeze_clear(phb->opal_id,
729                                                           slave->pe_number,
730                                                           OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
731                 }
732         }
733 
 734         /*
 735          * Associate the PE in PELT. We need to add the PE to the
 736          * corresponding PELT-V as well; otherwise, an error
 737          * originating from the PE might propagate to other
 738          * PEs.
 739          */
740         ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
741         if (ret)
742                 return ret;
743 
744         /* For compound PEs, any one affects all of them */
745         if (pe->flags & PNV_IODA_PE_MASTER) {
746                 list_for_each_entry(slave, &pe->slaves, list) {
747                         ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
748                         if (ret)
749                                 return ret;
750                 }
751         }
752 
753         if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
754                 pdev = pe->pbus->self;
755         else if (pe->flags & PNV_IODA_PE_DEV)
756                 pdev = pe->pdev->bus->self;
757 #ifdef CONFIG_PCI_IOV
758         else if (pe->flags & PNV_IODA_PE_VF)
759                 pdev = pe->parent_dev;
760 #endif /* CONFIG_PCI_IOV */
761         while (pdev) {
762                 struct pci_dn *pdn = pci_get_pdn(pdev);
763                 struct pnv_ioda_pe *parent;
764 
765                 if (pdn && pdn->pe_number != IODA_INVALID_PE) {
766                         parent = &phb->ioda.pe_array[pdn->pe_number];
767                         ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
768                         if (ret)
769                                 return ret;
770                 }
771 
772                 pdev = pdev->bus->self;
773         }
774 
775         return 0;
776 }
777 
778 static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
779 {
780         struct pci_dev *parent;
781         uint8_t bcomp, dcomp, fcomp;
782         int64_t rc;
783         long rid_end, rid;
784 
 785         /* Currently, we just deconfigure VF PEs. Bus PEs will always be there. */
786         if (pe->pbus) {
787                 int count;
788 
789                 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
790                 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
791                 parent = pe->pbus->self;
792                 if (pe->flags & PNV_IODA_PE_BUS_ALL)
793                         count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
794                 else
795                         count = 1;
796 
797                 switch(count) {
798                 case  1: bcomp = OpalPciBusAll;         break;
799                 case  2: bcomp = OpalPciBus7Bits;       break;
800                 case  4: bcomp = OpalPciBus6Bits;       break;
801                 case  8: bcomp = OpalPciBus5Bits;       break;
802                 case 16: bcomp = OpalPciBus4Bits;       break;
803                 case 32: bcomp = OpalPciBus3Bits;       break;
804                 default:
805                         dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
806                                 count);
807                         /* Do an exact match only */
808                         bcomp = OpalPciBusAll;
809                 }
810                 rid_end = pe->rid + (count << 8);
811         } else {
812 #ifdef CONFIG_PCI_IOV
813                 if (pe->flags & PNV_IODA_PE_VF)
814                         parent = pe->parent_dev;
815                 else
816 #endif
817                         parent = pe->pdev->bus->self;
818                 bcomp = OpalPciBusAll;
819                 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
820                 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
821                 rid_end = pe->rid + 1;
822         }
823 
824         /* Clear the reverse map */
825         for (rid = pe->rid; rid < rid_end; rid++)
826                 phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
827 
 828         /* Release from all parents' PELT-V */
829         while (parent) {
830                 struct pci_dn *pdn = pci_get_pdn(parent);
831                 if (pdn && pdn->pe_number != IODA_INVALID_PE) {
832                         rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
833                                                 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
 834                         /* XXX What to do in case of error? */
835                 }
836                 parent = parent->bus->self;
837         }
838 
839         opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
840                                   OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
841 
842         /* Disassociate PE in PELT */
843         rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
844                                 pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
845         if (rc)
846                 pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
847         rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
848                              bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
849         if (rc)
850                 pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
851 
852         pe->pbus = NULL;
853         pe->pdev = NULL;
854 #ifdef CONFIG_PCI_IOV
855         pe->parent_dev = NULL;
856 #endif
857 
858         return 0;
859 }
860 
861 static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
862 {
863         struct pci_dev *parent;
864         uint8_t bcomp, dcomp, fcomp;
865         long rc, rid_end, rid;
866 
 867         /* Bus validation? */
868         if (pe->pbus) {
869                 int count;
870 
871                 dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
872                 fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
873                 parent = pe->pbus->self;
874                 if (pe->flags & PNV_IODA_PE_BUS_ALL)
875                         count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
876                 else
877                         count = 1;
878 
879                 switch(count) {
880                 case  1: bcomp = OpalPciBusAll;         break;
881                 case  2: bcomp = OpalPciBus7Bits;       break;
882                 case  4: bcomp = OpalPciBus6Bits;       break;
883                 case  8: bcomp = OpalPciBus5Bits;       break;
884                 case 16: bcomp = OpalPciBus4Bits;       break;
885                 case 32: bcomp = OpalPciBus3Bits;       break;
886                 default:
887                         dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
888                                 count);
889                         /* Do an exact match only */
890                         bcomp = OpalPciBusAll;
891                 }
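                /*
                 * bcomp selects how many high-order bits of the bus number
                 * are compared when matching a RID to this PE: count == 1
                 * demands an exact bus match (OpalPciBusAll), count == 2
                 * compares only the top 7 bits (OpalPciBus7Bits), and so on
                 * down to count == 32 comparing the top 3 bits.
                 */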
892                 rid_end = pe->rid + (count << 8);
893         } else {
894 #ifdef CONFIG_PCI_IOV
895                 if (pe->flags & PNV_IODA_PE_VF)
896                         parent = pe->parent_dev;
897                 else
898 #endif /* CONFIG_PCI_IOV */
899                         parent = pe->pdev->bus->self;
900                 bcomp = OpalPciBusAll;
901                 dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
902                 fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
903                 rid_end = pe->rid + 1;
904         }
905 
 906         /*
 907          * Associate the PE in PELT. We need to add the PE to the
 908          * corresponding PELT-V as well; otherwise, an error
 909          * originating from the PE might propagate to other
 910          * PEs.
 911          */
912         rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
913                              bcomp, dcomp, fcomp, OPAL_MAP_PE);
914         if (rc) {
915                 pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
916                 return -ENXIO;
917         }
918 
919         /*
920          * Configure PELTV. NPUs don't have a PELTV table so skip
921          * configuration on them.
922          */
923         if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
924                 pnv_ioda_set_peltv(phb, pe, true);
925 
926         /* Setup reverse map */
927         for (rid = pe->rid; rid < rid_end; rid++)
928                 phb->ioda.pe_rmap[rid] = pe->pe_number;
929 
 930         /* Set up an MVE for the PE on IODA1 */
931         if (phb->type != PNV_PHB_IODA1) {
932                 pe->mve_number = 0;
933                 goto out;
934         }
935 
936         pe->mve_number = pe->pe_number;
937         rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
938         if (rc != OPAL_SUCCESS) {
939                 pe_err(pe, "OPAL error %ld setting up MVE %x\n",
940                        rc, pe->mve_number);
941                 pe->mve_number = -1;
942         } else {
943                 rc = opal_pci_set_mve_enable(phb->opal_id,
944                                              pe->mve_number, OPAL_ENABLE_MVE);
945                 if (rc) {
946                         pe_err(pe, "OPAL error %ld enabling MVE %x\n",
947                                rc, pe->mve_number);
948                         pe->mve_number = -1;
949                 }
950         }
951 
952 out:
953         return 0;
954 }
955 
956 #ifdef CONFIG_PCI_IOV
957 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
958 {
959         struct pci_dn *pdn = pci_get_pdn(dev);
960         int i;
961         struct resource *res, res2;
962         resource_size_t size;
963         u16 num_vfs;
964 
965         if (!dev->is_physfn)
966                 return -EINVAL;
967 
968         /*
969          * "offset" is in VFs.  The M64 windows are sized so that when they
970          * are segmented, each segment is the same size as the IOV BAR.
971          * Each segment is in a separate PE, and the high order bits of the
972          * address are the PE number.  Therefore, each VF's BAR is in a
973          * separate PE, and changing the IOV BAR start address changes the
974          * range of PEs the VFs are in.
975          */
976         num_vfs = pdn->num_vfs;
977         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
978                 res = &dev->resource[i + PCI_IOV_RESOURCES];
979                 if (!res->flags || !res->parent)
980                         continue;
981 
982                 /*
983                  * The actual IOV BAR range is determined by the start address
984                  * and the actual size for num_vfs VFs BAR.  This check is to
985                  * make sure that after shifting, the range will not overlap
986                  * with another device.
987                  */
988                 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
989                 res2.flags = res->flags;
990                 res2.start = res->start + (size * offset);
991                 res2.end = res2.start + (size * num_vfs) - 1;
992 
993                 if (res2.end > res->end) {
994                         dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
995                                 i, &res2, res, num_vfs, offset);
996                         return -EBUSY;
997                 }
998         }
999 
1000         /*
1001          * Since the M64 BAR shares segments among all possible 256 PEs,
1002          * we have to shift the beginning of the PF IOV BAR so that it starts
1003          * from the segment belonging to the PE number assigned to the first
1004          * VF. This creates a "hole" in /proc/iomem which could otherwise be
1005          * used for allocating other resources, so we reserve this area below
1006          * and release it when IOV is released.
1007          */
1008         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1009                 res = &dev->resource[i + PCI_IOV_RESOURCES];
1010                 if (!res->flags || !res->parent)
1011                         continue;
1012 
1013                 size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
1014                 res2 = *res;
1015                 res->start += size * offset;
1016 
1017                 dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
1018                          i, &res2, res, (offset > 0) ? "En" : "Dis",
1019                          num_vfs, offset);
1020 
1021                 if (offset < 0) {
1022                         devm_release_resource(&dev->dev, &pdn->holes[i]);
1023                         memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
1024                 }
1025 
1026                 pci_update_resource(dev, i + PCI_IOV_RESOURCES);
1027 
1028                 if (offset > 0) {
1029                         pdn->holes[i].start = res2.start;
1030                         pdn->holes[i].end = res2.start + size * offset - 1;
1031                         pdn->holes[i].flags = IORESOURCE_BUS;
1032                         pdn->holes[i].name = "pnv_iov_reserved";
1033                         devm_request_resource(&dev->dev, res->parent,
1034                                         &pdn->holes[i]);
1035                 }
1036         }
1037         return 0;
1038 }
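
/*
 * A sketch with made-up numbers: if each VF BAR segment is 1MB
 * (size = 0x100000) and the first VF lands in PE#4 (offset = 4), then
 * res->start moves up by 4MB and the 4MB "hole" below it is reserved as
 * "pnv_iov_reserved"; the negative-offset path later releases that hole
 * when the VFs are disabled.
 */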
1039 #endif /* CONFIG_PCI_IOV */
1040 
1041 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
1042 {
1043         struct pci_controller *hose = pci_bus_to_host(dev->bus);
1044         struct pnv_phb *phb = hose->private_data;
1045         struct pci_dn *pdn = pci_get_pdn(dev);
1046         struct pnv_ioda_pe *pe;
1047 
1048         if (!pdn) {
1049                 pr_err("%s: Device tree node not associated properly\n",
1050                            pci_name(dev));
1051                 return NULL;
1052         }
1053         if (pdn->pe_number != IODA_INVALID_PE)
1054                 return NULL;
1055 
1056         pe = pnv_ioda_alloc_pe(phb);
1057         if (!pe) {
1058                 pr_warn("%s: Not enough PE# available, disabling device\n",
1059                         pci_name(dev));
1060                 return NULL;
1061         }
1062 
1063         /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
1064          * pointer in the PE data structure; both should be destroyed at the
1065          * same time. However, this needs to be looked at more closely again
1066          * once we actually start removing things (hotplug, SR-IOV, ...).
1067          *
1068          * At some point we want to remove the pdn completely anyway.
1069          */
1070         pci_dev_get(dev);
1071         pdn->pe_number = pe->pe_number;
1072         pe->flags = PNV_IODA_PE_DEV;
1073         pe->pdev = dev;
1074         pe->pbus = NULL;
1075         pe->mve_number = -1;
1076         pe->rid = dev->bus->number << 8 | pdn->devfn;
1077 
1078         pe_info(pe, "Associated device to PE\n");
1079 
1080         if (pnv_ioda_configure_pe(phb, pe)) {
1081                 /* XXX What do we do here? */
1082                 pnv_ioda_free_pe(pe);
1083                 pdn->pe_number = IODA_INVALID_PE;
1084                 pe->pdev = NULL;
1085                 pci_dev_put(dev);
1086                 return NULL;
1087         }
1088 
1089         /* Put PE to the list */
1090         list_add_tail(&pe->list, &phb->ioda.pe_list);
1091 
1092         return pe;
1093 }
1094 
1095 static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
1096 {
1097         struct pci_dev *dev;
1098 
1099         list_for_each_entry(dev, &bus->devices, bus_list) {
1100                 struct pci_dn *pdn = pci_get_pdn(dev);
1101 
1102                 if (pdn == NULL) {
1103                 pr_warn("%s: No device node associated with device!\n",
1104                                 pci_name(dev));
1105                         continue;
1106                 }
1107 
1108                 /*
1109                  * In the partial hotplug case, the PCI device might still
1110                  * be associated with the PE, so it needn't be attached to
1111                  * the PE again.
1112                  */
1113                 if (pdn->pe_number != IODA_INVALID_PE)
1114                         continue;
1115 
1116                 pe->device_count++;
1117                 pdn->pe_number = pe->pe_number;
1118                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1119                         pnv_ioda_setup_same_PE(dev->subordinate, pe);
1120         }
1121 }
1122 
1123 /*
1124  * There are two types of PCI-bus-sensitive PEs: one that comprises a
1125  * single PCI bus, and another that contains the primary PCI bus and its
1126  * subordinate PCI devices and buses. The second type of PE is normally
1127  * created for a PCIe-to-PCI bridge or the downstream ports of a PLX switch.
1128  */
1129 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
1130 {
1131         struct pci_controller *hose = pci_bus_to_host(bus);
1132         struct pnv_phb *phb = hose->private_data;
1133         struct pnv_ioda_pe *pe = NULL;
1134         unsigned int pe_num;
1135 
1136         /*
1137          * In the partial hotplug case, the PE instance might still be
1138          * alive. We should reuse it instead of allocating a new one.
1139          */
1140         pe_num = phb->ioda.pe_rmap[bus->number << 8];
1141         if (pe_num != IODA_INVALID_PE) {
1142                 pe = &phb->ioda.pe_array[pe_num];
1143                 pnv_ioda_setup_same_PE(bus, pe);
1144                 return NULL;
1145         }
1146 
1147         /* PE number for root bus should have been reserved */
1148         if (pci_is_root_bus(bus) &&
1149             phb->ioda.root_pe_idx != IODA_INVALID_PE)
1150                 pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
1151 
1152         /* Check if PE is determined by M64 */
1153         if (!pe)
1154                 pe = pnv_ioda_pick_m64_pe(bus, all);
1155 
1156         /* The PE number isn't pinned by M64 */
1157         if (!pe)
1158                 pe = pnv_ioda_alloc_pe(phb);
1159 
1160         if (!pe) {
1161                 pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
1162                         __func__, pci_domain_nr(bus), bus->number);
1163                 return NULL;
1164         }
1165 
1166         pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
1167         pe->pbus = bus;
1168         pe->pdev = NULL;
1169         pe->mve_number = -1;
1170         pe->rid = bus->busn_res.start << 8;
1171 
1172         if (all)
1173                 pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
1174                         &bus->busn_res.start, &bus->busn_res.end,
1175                         pe->pe_number);
1176         else
1177                 pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
1178                         &bus->busn_res.start, pe->pe_number);
1179 
1180         if (pnv_ioda_configure_pe(phb, pe)) {
1181                 /* XXX What do we do here? */
1182                 pnv_ioda_free_pe(pe);
1183                 pe->pbus = NULL;
1184                 return NULL;
1185         }
1186 
1187         /* Associate it with all child devices */
1188         pnv_ioda_setup_same_PE(bus, pe);
1189 
1190         /* Put PE to the list */
1191         list_add_tail(&pe->list, &phb->ioda.pe_list);
1192 
1193         return pe;
1194 }
1195 
1196 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
1197 {
1198         int pe_num, found_pe = false, rc;
1199         long rid;
1200         struct pnv_ioda_pe *pe;
1201         struct pci_dev *gpu_pdev;
1202         struct pci_dn *npu_pdn;
1203         struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
1204         struct pnv_phb *phb = hose->private_data;
1205 
1206         /*
1207          * Due to a hardware erratum, PE#0 on the NPU is reserved for
1208          * error handling. This means we only have three PEs remaining,
1209          * which need to be assigned to four links, implying some
1210          * links must share PEs.
1211          *
1212          * To achieve this we assign PEs such that NPUs linking the
1213          * same GPU get assigned the same PE.
1214          */
1215         gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
1216         for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
1217                 pe = &phb->ioda.pe_array[pe_num];
1218                 if (!pe->pdev)
1219                         continue;
1220 
1221                 if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
1222                         /*
1223                          * This device has the same peer GPU so should
1224                          * be assigned the same PE as the existing
1225                          * peer NPU.
1226                          */
1227                         dev_info(&npu_pdev->dev,
1228                                 "Associating to existing PE %x\n", pe_num);
1229                         pci_dev_get(npu_pdev);
1230                         npu_pdn = pci_get_pdn(npu_pdev);
1231                         rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
1232                         npu_pdn->pe_number = pe_num;
1233                         phb->ioda.pe_rmap[rid] = pe->pe_number;
1234 
1235                         /* Map the PE to this link */
1236                         rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
1237                                         OpalPciBusAll,
1238                                         OPAL_COMPARE_RID_DEVICE_NUMBER,
1239                                         OPAL_COMPARE_RID_FUNCTION_NUMBER,
1240                                         OPAL_MAP_PE);
1241                         WARN_ON(rc != OPAL_SUCCESS);
1242                         found_pe = true;
1243                         break;
1244                 }
1245         }
1246 
1247         if (!found_pe)
1248                 /*
1249                  * Could not find an existing PE so allocate a new
1250                  * one.
1251                  */
1252                 return pnv_ioda_setup_dev_PE(npu_pdev);
1253         else
1254                 return pe;
1255 }
1256 
1257 static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
1258 {
1259         struct pci_dev *pdev;
1260 
1261         list_for_each_entry(pdev, &bus->devices, bus_list)
1262                 pnv_ioda_setup_npu_PE(pdev);
1263 }
1264 
1265 static void pnv_pci_ioda_setup_PEs(void)
1266 {
1267         struct pci_controller *hose;
1268         struct pnv_phb *phb;
1269         struct pci_bus *bus;
1270         struct pci_dev *pdev;
1271         struct pnv_ioda_pe *pe;
1272 
1273         list_for_each_entry(hose, &hose_list, list_node) {
1274                 phb = hose->private_data;
1275                 if (phb->type == PNV_PHB_NPU_NVLINK) {
1276                         /* PE#0 is needed for error reporting */
1277                         pnv_ioda_reserve_pe(phb, 0);
1278                         pnv_ioda_setup_npu_PEs(hose->bus);
1279                         if (phb->model == PNV_PHB_MODEL_NPU2)
1280                                 WARN_ON_ONCE(pnv_npu2_init(hose));
1281                 }
1282                 if (phb->type == PNV_PHB_NPU_OCAPI) {
1283                         bus = hose->bus;
1284                         list_for_each_entry(pdev, &bus->devices, bus_list)
1285                                 pnv_ioda_setup_dev_PE(pdev);
1286                 }
1287         }
1288         list_for_each_entry(hose, &hose_list, list_node) {
1289                 phb = hose->private_data;
1290                 if (phb->type != PNV_PHB_IODA2)
1291                         continue;
1292 
1293                 list_for_each_entry(pe, &phb->ioda.pe_list, list)
1294                         pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
1295         }
1296 }
1297 
1298 #ifdef CONFIG_PCI_IOV
1299 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
1300 {
1301         struct pci_bus        *bus;
1302         struct pci_controller *hose;
1303         struct pnv_phb        *phb;
1304         struct pci_dn         *pdn;
1305         int                    i, j;
1306         int                    m64_bars;
1307 
1308         bus = pdev->bus;
1309         hose = pci_bus_to_host(bus);
1310         phb = hose->private_data;
1311         pdn = pci_get_pdn(pdev);
1312 
1313         if (pdn->m64_single_mode)
1314                 m64_bars = num_vfs;
1315         else
1316                 m64_bars = 1;
1317 
1318         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
1319                 for (j = 0; j < m64_bars; j++) {
1320                         if (pdn->m64_map[j][i] == IODA_INVALID_M64)
1321                                 continue;
1322                         opal_pci_phb_mmio_enable(phb->opal_id,
1323                                 OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
1324                         clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
1325                         pdn->m64_map[j][i] = IODA_INVALID_M64;
1326                 }
1327 
1328         kfree(pdn->m64_map);
1329         return 0;
1330 }
1331 
1332 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
1333 {
1334         struct pci_bus        *bus;
1335         struct pci_controller *hose;
1336         struct pnv_phb        *phb;
1337         struct pci_dn         *pdn;
1338         unsigned int           win;
1339         struct resource       *res;
1340         int                    i, j;
1341         int64_t                rc;
1342         int                    total_vfs;
1343         resource_size_t        size, start;
1344         int                    pe_num;
1345         int                    m64_bars;
1346 
1347         bus = pdev->bus;
1348         hose = pci_bus_to_host(bus);
1349         phb = hose->private_data;
1350         pdn = pci_get_pdn(pdev);
1351         total_vfs = pci_sriov_get_totalvfs(pdev);
1352 
1353         if (pdn->m64_single_mode)
1354                 m64_bars = num_vfs;
1355         else
1356                 m64_bars = 1;
1357 
1358         pdn->m64_map = kmalloc_array(m64_bars,
1359                                      sizeof(*pdn->m64_map),
1360                                      GFP_KERNEL);
1361         if (!pdn->m64_map)
1362                 return -ENOMEM;
1363         /* Initialize the m64_map to IODA_INVALID_M64 */
1364         for (i = 0; i < m64_bars ; i++)
1365                 for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
1366                         pdn->m64_map[i][j] = IODA_INVALID_M64;
1367 
1368 
1369         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
1370                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
1371                 if (!res->flags || !res->parent)
1372                         continue;
1373 
1374                 for (j = 0; j < m64_bars; j++) {
1375                         do {
1376                                 win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
1377                                                 phb->ioda.m64_bar_idx + 1, 0);
1378 
1379                                 if (win >= phb->ioda.m64_bar_idx + 1)
1380                                         goto m64_failed;
1381                         } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
1382 
1383                         pdn->m64_map[j][i] = win;
1384 
1385                         if (pdn->m64_single_mode) {
1386                                 size = pci_iov_resource_size(pdev,
1387                                                         PCI_IOV_RESOURCES + i);
1388                                 start = res->start + size * j;
1389                         } else {
1390                                 size = resource_size(res);
1391                                 start = res->start;
1392                         }
1393 
1394                         /* Map the M64 here */
1395                         if (pdn->m64_single_mode) {
1396                                 pe_num = pdn->pe_num_map[j];
1397                                 rc = opal_pci_map_pe_mmio_window(phb->opal_id,
1398                                                 pe_num, OPAL_M64_WINDOW_TYPE,
1399                                                 pdn->m64_map[j][i], 0);
                                     if (rc != OPAL_SUCCESS) {
                                             dev_err(&pdev->dev, "Failed to map M64 window #%d to PE#%d: %lld\n",
                                                     win, pe_num, rc);
                                             goto m64_failed;
                                     }
1400                         }
1401 
1402                         rc = opal_pci_set_phb_mem_window(phb->opal_id,
1403                                                  OPAL_M64_WINDOW_TYPE,
1404                                                  pdn->m64_map[j][i],
1405                                                  start,
1406                                                  0, /* unused */
1407                                                  size);
1408 
1409 
1410                         if (rc != OPAL_SUCCESS) {
1411                                 dev_err(&pdev->dev, "Failed to set M64 window #%d: %lld\n",
1412                                         win, rc);
1413                                 goto m64_failed;
1414                         }
1415 
1416                         if (pdn->m64_single_mode)
1417                                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
1418                                      OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
1419                         else
1420                                 rc = opal_pci_phb_mmio_enable(phb->opal_id,
1421                                      OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
1422 
1423                         if (rc != OPAL_SUCCESS) {
1424                                 dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
1425                                         win, rc);
1426                                 goto m64_failed;
1427                         }
1428                 }
1429         }
1430         return 0;
1431 
1432 m64_failed:
1433         pnv_pci_vf_release_m64(pdev, num_vfs);
1434         return -EBUSY;
1435 }
1436 
1437 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
1438                 int num);
1439 
1440 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
1441 {
1442         struct iommu_table    *tbl;
1443         int64_t               rc;
1444 
1445         tbl = pe->table_group.tables[0];
1446         rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
1447         if (rc)
1448                 pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
1449 
1450         pnv_pci_ioda2_set_bypass(pe, false);
1451         if (pe->table_group.group) {
1452                 iommu_group_put(pe->table_group.group);
1453                 BUG_ON(pe->table_group.group);
1454         }
1455         iommu_tce_table_put(tbl);
1456 }
1457 
1458 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
1459 {
1460         struct pci_bus        *bus;
1461         struct pci_controller *hose;
1462         struct pnv_phb        *phb;
1463         struct pnv_ioda_pe    *pe, *pe_n;
1464         struct pci_dn         *pdn;
1465 
1466         bus = pdev->bus;
1467         hose = pci_bus_to_host(bus);
1468         phb = hose->private_data;
1469         pdn = pci_get_pdn(pdev);
1470 
1471         if (!pdev->is_physfn)
1472                 return;
1473 
1474         list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
1475                 if (pe->parent_dev != pdev)
1476                         continue;
1477 
1478                 pnv_pci_ioda2_release_dma_pe(pdev, pe);
1479 
1480                 /* Remove from list */
1481                 mutex_lock(&phb->ioda.pe_list_mutex);
1482                 list_del(&pe->list);
1483                 mutex_unlock(&phb->ioda.pe_list_mutex);
1484 
1485                 pnv_ioda_deconfigure_pe(phb, pe);
1486 
1487                 pnv_ioda_free_pe(pe);
1488         }
1489 }
1490 
1491 void pnv_pci_sriov_disable(struct pci_dev *pdev)
1492 {
1493         struct pci_bus        *bus;
1494         struct pci_controller *hose;
1495         struct pnv_phb        *phb;
1496         struct pnv_ioda_pe    *pe;
1497         struct pci_dn         *pdn;
1498         u16                    num_vfs, i;
1499 
1500         bus = pdev->bus;
1501         hose = pci_bus_to_host(bus);
1502         phb = hose->private_data;
1503         pdn = pci_get_pdn(pdev);
1504         num_vfs = pdn->num_vfs;
1505 
1506         /* Release VF PEs */
1507         pnv_ioda_release_vf_PE(pdev);
1508 
1509         if (phb->type == PNV_PHB_IODA2) {
1510                 if (!pdn->m64_single_mode)
1511                         pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
1512 
1513                 /* Release M64 windows */
1514                 pnv_pci_vf_release_m64(pdev, num_vfs);
1515 
1516                 /* Release PE numbers */
1517                 if (pdn->m64_single_mode) {
1518                         for (i = 0; i < num_vfs; i++) {
1519                                 if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1520                                         continue;
1521 
1522                                 pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1523                                 pnv_ioda_free_pe(pe);
1524                         }
1525                 } else
1526                         bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1527                 /* Releasing pe_num_map */
1528                 kfree(pdn->pe_num_map);
1529         }
1530 }
1531 
1532 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
1533                                        struct pnv_ioda_pe *pe);
1534 #ifdef CONFIG_IOMMU_API
1535 static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
1536                 struct iommu_table_group *table_group, struct pci_bus *bus);
1537 
1538 #endif
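     /*
      * PE numbering for VFs, matching the allocation done in
      * pnv_pci_sriov_enable(): in single PE mode each VF has its own
      * independently allocated PE number in pdn->pe_num_map[vf_index];
      * in shared mode *pdn->pe_num_map is the first of num_vfs
      * consecutive PE numbers and VF vf_index uses
      * *pdn->pe_num_map + vf_index.
      */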
1539 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
1540 {
1541         struct pci_bus        *bus;
1542         struct pci_controller *hose;
1543         struct pnv_phb        *phb;
1544         struct pnv_ioda_pe    *pe;
1545         int                    pe_num;
1546         u16                    vf_index;
1547         struct pci_dn         *pdn;
1548 
1549         bus = pdev->bus;
1550         hose = pci_bus_to_host(bus);
1551         phb = hose->private_data;
1552         pdn = pci_get_pdn(pdev);
1553 
1554         if (!pdev->is_physfn)
1555                 return;
1556 
1557         /* Reserve PE for each VF */
1558         for (vf_index = 0; vf_index < num_vfs; vf_index++) {
1559                 if (pdn->m64_single_mode)
1560                         pe_num = pdn->pe_num_map[vf_index];
1561                 else
1562                         pe_num = *pdn->pe_num_map + vf_index;
1563 
1564                 pe = &phb->ioda.pe_array[pe_num];
1565                 pe->pe_number = pe_num;
1566                 pe->phb = phb;
1567                 pe->flags = PNV_IODA_PE_VF;
1568                 pe->pbus = NULL;
1569                 pe->parent_dev = pdev;
1570                 pe->mve_number = -1;
1571                 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
1572                            pci_iov_virtfn_devfn(pdev, vf_index);
1573 
1574                 pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
1575                         hose->global_number, pdev->bus->number,
1576                         PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
1577                         PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
1578 
1579                 if (pnv_ioda_configure_pe(phb, pe)) {
1580                         /* XXX What do we do here ? */
1581                         pnv_ioda_free_pe(pe);
1582                         pe->pdev = NULL;
1583                         continue;
1584                 }
1585 
1586                 /* Put PE to the list */
1587                 mutex_lock(&phb->ioda.pe_list_mutex);
1588                 list_add_tail(&pe->list, &phb->ioda.pe_list);
1589                 mutex_unlock(&phb->ioda.pe_list_mutex);
1590 
1591                 pnv_pci_ioda2_setup_dma_pe(phb, pe);
1592 #ifdef CONFIG_IOMMU_API
1593                 iommu_register_group(&pe->table_group,
1594                                 pe->phb->hose->global_number, pe->pe_number);
1595                 pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
1596 #endif
1597         }
1598 }
1599 
1600 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1601 {
1602         struct pci_bus        *bus;
1603         struct pci_controller *hose;
1604         struct pnv_phb        *phb;
1605         struct pnv_ioda_pe    *pe;
1606         struct pci_dn         *pdn;
1607         int                    ret;
1608         u16                    i;
1609 
1610         bus = pdev->bus;
1611         hose = pci_bus_to_host(bus);
1612         phb = hose->private_data;
1613         pdn = pci_get_pdn(pdev);
1614 
1615         if (phb->type == PNV_PHB_IODA2) {
1616                 if (!pdn->vfs_expanded) {
1617                         dev_info(&pdev->dev, "don't support this SRIOV device"
1618                                 " with non 64bit-prefetchable IOV BAR\n");
1619                         return -ENOSPC;
1620                 }
1621 
1622                 /*
1623                  * When the M64 BARs function in Single PE mode, the number
1624                  * of VFs that can be enabled must not exceed the number of
1625                  * M64 BARs.
1626                  */
1626                 if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
1627                         dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
1628                         return -EBUSY;
1629                 }
1630 
1631                 /* Allocating pe_num_map */
1632                 if (pdn->m64_single_mode)
1633                         pdn->pe_num_map = kmalloc_array(num_vfs,
1634                                                         sizeof(*pdn->pe_num_map),
1635                                                         GFP_KERNEL);
1636                 else
1637                         pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
1638 
1639                 if (!pdn->pe_num_map)
1640                         return -ENOMEM;
1641 
1642                 if (pdn->m64_single_mode)
1643                         for (i = 0; i < num_vfs; i++)
1644                                 pdn->pe_num_map[i] = IODA_INVALID_PE;
1645 
1646                 /* Calculate available PE for required VFs */
1647                 if (pdn->m64_single_mode) {
1648                         for (i = 0; i < num_vfs; i++) {
1649                                 pe = pnv_ioda_alloc_pe(phb);
1650                                 if (!pe) {
1651                                         ret = -EBUSY;
1652                                         goto m64_failed;
1653                                 }
1654 
1655                                 pdn->pe_num_map[i] = pe->pe_number;
1656                         }
1657                 } else {
1658                         mutex_lock(&phb->ioda.pe_alloc_mutex);
1659                         *pdn->pe_num_map = bitmap_find_next_zero_area(
1660                                 phb->ioda.pe_alloc, phb->ioda.total_pe_num,
1661                                 0, num_vfs, 0);
1662                         if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
1663                                 mutex_unlock(&phb->ioda.pe_alloc_mutex);
1664                                 dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
1665                                 kfree(pdn->pe_num_map);
1666                                 return -EBUSY;
1667                         }
1668                         bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1669                         mutex_unlock(&phb->ioda.pe_alloc_mutex);
1670                 }
1671                 pdn->num_vfs = num_vfs;
1672 
1673                 /* Assign M64 window accordingly */
1674                 ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
1675                 if (ret) {
1676                         dev_info(&pdev->dev, "Not enough M64 window resources\n");
1677                         goto m64_failed;
1678                 }
1679 
1680                 /*
1681                  * When using one M64 BAR to map one IOV BAR, we need to shift
1682                  * the IOV BAR according to the PE# allocated to the VFs.
1683                  * Otherwise, the PE# for the VF will conflict with others.
1684                  */
1685                 if (!pdn->m64_single_mode) {
1686                         ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
1687                         if (ret)
1688                                 goto m64_failed;
1689                 }
1690         }
1691 
1692         /* Setup VF PEs */
1693         pnv_ioda_setup_vf_PE(pdev, num_vfs);
1694 
1695         return 0;
1696 
1697 m64_failed:
1698         if (pdn->m64_single_mode) {
1699                 for (i = 0; i < num_vfs; i++) {
1700                         if (pdn->pe_num_map[i] == IODA_INVALID_PE)
1701                                 continue;
1702 
1703                         pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
1704                         pnv_ioda_free_pe(pe);
1705                 }
1706         } else
1707                 bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
1708 
1709         /* Releasing pe_num_map */
1710         kfree(pdn->pe_num_map);
1711 
1712         return ret;
1713 }
1714 
1715 int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
1716 {
1717         pnv_pci_sriov_disable(pdev);
1718 
1719         /* Release PCI data */
1720         remove_dev_pci_data(pdev);
1721         return 0;
1722 }
1723 
1724 int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
1725 {
1726         /* Allocate PCI data */
1727         add_dev_pci_data(pdev);
1728 
1729         return pnv_pci_sriov_enable(pdev, num_vfs);
1730 }
1731 #endif /* CONFIG_PCI_IOV */
1732 
1733 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
1734 {
1735         struct pci_dn *pdn = pci_get_pdn(pdev);
1736         struct pnv_ioda_pe *pe;
1737 
1738         /*
1739          * This function can be called before the PE#
1740          * has been assigned. Do nothing in that
1741          * case.
1742          */
1743         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
1744                 return;
1745 
1746         pe = &phb->ioda.pe_array[pdn->pe_number];
1747         WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
1748         pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
1749         set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
1750         /*
1751          * Note: iommu_add_device() will fail here as
1752          * for physical PE: the device is already added by now;
1753          * for virtual PE: sysfs entries are not ready yet and
1754          * tce_iommu_bus_notifier will add the device to a group later.
1755          */
1756 }
1757 
1758 /*
1759  * Reconfigure TVE#0 to be usable as 64-bit DMA space.
1760  *
1761  * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
1762  * Devices can only access more than that if bit 59 of the PCI address is set
1763  * by hardware, which indicates TVE#1 should be used instead of TVE#0.
1764  * Many PCI devices are not capable of addressing that many bits, and as a
1765  * result are limited to the 4GB of virtual memory made available to 32-bit
1766  * devices in TVE#0.
1767  *
1768  * In order to work around this, reconfigure TVE#0 for 64-bit devices by
1769  * mapping all of memory at a 4GB offset, leaving the first 4GB of DMA
1770  * space unmapped.  This should only be used by devices that want more than
1771  * 4GB, and only on PEs that have no 32-bit devices.
1772  *
1773  * Currently this will only work on PHB3 (POWER8).
1774  */
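     /*
      * A worked example with illustrative numbers: on a machine where
      * memory_hotplug_max() returns 64GB, window_size becomes
      * roundup_pow_of_two(64GB + 4GB) = 128GB, giving tce_count =
      * 2^37 >> 28 = 512 TCEs and an 8 * 512 = 4KB table (rounded up to
      * PAGE_SIZE).  Each 256MB of RAM at address A is then reachable at
      * DMA address A + 4GB, which is why the caller sets dma_offset to
      * 1ULL << 32.
      */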
1775 static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
1776 {
1777         u64 window_size, table_size, tce_count, addr;
1778         struct page *table_pages;
1779         u64 tce_order = 28; /* 256MB TCEs */
1780         __be64 *tces;
1781         s64 rc;
1782 
1783         /*
1784          * Window size needs to be a power of two, but needs to account for
1785          * shifting memory by the 4GB offset required to skip 32bit space.
1786          */
1787         window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
1788         tce_count = window_size >> tce_order;
1789         table_size = tce_count << 3;
1790 
1791         if (table_size < PAGE_SIZE)
1792                 table_size = PAGE_SIZE;
1793 
1794         table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
1795                                        get_order(table_size));
1796         if (!table_pages)
1797                 goto err;
1798 
1799         tces = page_address(table_pages);
1800         if (!tces)
1801                 goto err;
1802 
1803         memset(tces, 0, table_size);
1804 
1805         for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
1806                 tces[(addr + (1ULL << 32)) >> tce_order] =
1807                         cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
1808         }
1809 
1810         rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
1811                                         pe->pe_number,
1812                                         /* reconfigure window 0 */
1813                                         (pe->pe_number << 1) + 0,
1814                                         1,
1815                                         __pa(tces),
1816                                         table_size,
1817                                         1 << tce_order);
1818         if (rc == OPAL_SUCCESS) {
1819                 pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
1820                 return 0;
1821         }

             /* Don't leak the TCE table if OPAL rejects the window */
             __free_pages(table_pages, get_order(table_size));
1822 err:
1823         pe_err(pe, "Error configuring 64-bit DMA bypass\n");
1824         return -EIO;
1825 }
1826 
1827 static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
1828                 u64 dma_mask)
1829 {
1830         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
1831         struct pnv_phb *phb = hose->private_data;
1832         struct pci_dn *pdn = pci_get_pdn(pdev);
1833         struct pnv_ioda_pe *pe;
1834 
1835         if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
1836                 return false;
1837 
1838         pe = &phb->ioda.pe_array[pdn->pe_number];
1839         if (pe->tce_bypass_enabled) {
1840                 u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
1841                 if (dma_mask >= top)
1842                         return true;
1843         }
1844 
1845         /*
1846          * If the device can't set the TCE bypass bit but still wants
1847          * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
1848          * bypass the 32-bit region and be usable for 64-bit DMAs.
1849          * The device needs to be able to address all of this space.
1850          */
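              /*
               * For example (illustrative masks): a device with a full
               * 64-bit DMA mask already passed the bypass test above,
               * since tce_bypass_base is 1ULL << 59.  A device limited
               * to 48 bits fails that test, but as long as
               * memory_hotplug_max() + 4GB is below 2^48 it can use the
               * shifted TVE#0 window on PHB3 instead.
               */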
1851         if (dma_mask >> 32 &&
1852             dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
1853             /* pe->pdev should be set if it's a single device, pe->pbus if not */
1854             (pe->device_count == 1 || !pe->pbus) &&
1855             phb->model == PNV_PHB_MODEL_PHB3) {
1856                 /* Configure the bypass mode */
1857                 s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
1858                 if (rc)
1859                         return false;
1860                 /* 4GB offset bypasses 32-bit space */
1861                 pdev->dev.archdata.dma_offset = (1ULL << 32);
1862                 return true;
1863         }
1864 
1865         return false;
1866 }
1867 
1868 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
1869 {
1870         struct pci_dev *dev;
1871 
1872         list_for_each_entry(dev, &bus->devices, bus_list) {
1873                 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
1874                 dev->dev.archdata.dma_offset = pe->tce_bypass_base;
1875 
1876                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
1877                         pnv_ioda_setup_bus_dma(pe, dev->subordinate);
1878         }
1879 }
1880 
1881 static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
1882                                                      bool real_mode)
1883 {
1884         return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
1885                 (phb->regs + 0x210);
1886 }
1887 
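     /*
      * p7ioc invalidation walks the TCE table in 16-byte (2 TCE) steps,
      * writing the physical address of each pair, tagged with bit 63,
      * to the invalidate register.  For example, invalidating npages = 4
      * starting at the first entry covers entries 0..3 (32 bytes) and
      * therefore issues two register writes.
      */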
1888 static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
1889                 unsigned long index, unsigned long npages, bool rm)
1890 {
1891         struct iommu_table_group_link *tgl = list_first_entry_or_null(
1892                         &tbl->it_group_list, struct iommu_table_group_link,
1893                         next);
1894         struct pnv_ioda_pe *pe = container_of(tgl->table_group,
1895                         struct pnv_ioda_pe, table_group);
1896         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
1897         unsigned long start, end, inc;
1898 
1899         start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
1900         end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
1901                         npages - 1);
1902 
1903         /* p7ioc-style invalidation, 2 TCEs per write */
1904         start |= (1ull << 63);
1905         end |= (1ull << 63);
1906         inc = 16;
1907         end |= inc - 1; /* round up end to be different than start */
1908 
1909         mb(); /* Ensure above stores are visible */
1910         while (start <= end) {
1911                 if (rm)
1912                         __raw_rm_writeq_be(start, invalidate);
1913                 else
1914                         __raw_writeq_be(start, invalidate);
1915 
1916                 start += inc;
1917         }
1918 
1919         /*
1920          * The iommu layer will do another mb() for us on build()
1921          * and we don't care on free()
1922          */
1923 }
1924 
1925 static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
1926                 long npages, unsigned long uaddr,
1927                 enum dma_data_direction direction,
1928                 unsigned long attrs)
1929 {
1930         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
1931                         attrs);
1932 
1933         if (!ret)
1934                 pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1935 
1936         return ret;
1937 }
1938 
1939 #ifdef CONFIG_IOMMU_API
1940 static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
1941                 unsigned long *hpa, enum dma_data_direction *direction)
1942 {
1943         long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
1944 
1945         if (!ret)
1946                 pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
1947 
1948         return ret;
1949 }
1950 
1951 static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
1952                 unsigned long *hpa, enum dma_data_direction *direction)
1953 {
1954         long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
1955 
1956         if (!ret)
1957                 pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
1958 
1959         return ret;
1960 }
1961 #endif
1962 
1963 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
1964                 long npages)
1965 {
1966         pnv_tce_free(tbl, index, npages);
1967 
1968         pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
1969 }
1970 
1971 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
1972         .set = pnv_ioda1_tce_build,
1973 #ifdef CONFIG_IOMMU_API
1974         .exchange = pnv_ioda1_tce_xchg,
1975         .exchange_rm = pnv_ioda1_tce_xchg_rm,
1976         .useraddrptr = pnv_tce_useraddrptr,
1977 #endif
1978         .clear = pnv_ioda1_tce_free,
1979         .get = pnv_tce_get,
1980 };
1981 
1982 #define PHB3_TCE_KILL_INVAL_ALL         PPC_BIT(0)
1983 #define PHB3_TCE_KILL_INVAL_PE          PPC_BIT(1)
1984 #define PHB3_TCE_KILL_INVAL_ONE         PPC_BIT(2)
1985 
1986 static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
1987 {
1988         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
1989         const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
1990 
1991         mb(); /* Ensure previous TCE table stores are visible */
1992         if (rm)
1993                 __raw_rm_writeq_be(val, invalidate);
1994         else
1995                 __raw_writeq_be(val, invalidate);
1996 }
1997 
1998 static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
1999 {
2000         /* 01xb - invalidate TCEs that match the specified PE# */
2001         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
2002         unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
2003 
2004         mb(); /* Ensure above stores are visible */
2005         __raw_writeq_be(val, invalidate);
2006 }
2007 
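     /*
      * "Kill one" invalidation encodes the PE number and the DMA address
      * into each write.  For example, with PE# 5, 4K pages (shift = 12),
      * index = 0x10 and npages = 2: start = PHB3_TCE_KILL_INVAL_ONE | 5 |
      * (0x10 << 12), end = the same for index 0x11 and inc = 0x1000, so
      * the loop below issues two writes to the invalidate register.
      */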
2008 static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
2009                                         unsigned shift, unsigned long index,
2010                                         unsigned long npages)
2011 {
2012         __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
2013         unsigned long start, end, inc;
2014 
2015         /* We'll invalidate DMA addresses in PE scope */
2016         start = PHB3_TCE_KILL_INVAL_ONE;
2017         start |= (pe->pe_number & 0xFF);
2018         end = start;
2019 
2020         /* Figure out the start, end and step */
2021         start |= (index << shift);
2022         end |= ((index + npages - 1) << shift);
2023         inc = (0x1ull << shift);
2024         mb();
2025 
2026         while (start <= end) {
2027                 if (rm)
2028                         __raw_rm_writeq_be(start, invalidate);
2029                 else
2030                         __raw_writeq_be(start, invalidate);
2031                 start += inc;
2032         }
2033 }
2034 
2035 static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
2036 {
2037         struct pnv_phb *phb = pe->phb;
2038 
2039         if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2040                 pnv_pci_phb3_tce_invalidate_pe(pe);
2041         else
2042                 opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE,
2043                                   pe->pe_number, 0, 0, 0);
2044 }
2045 
2046 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
2047                 unsigned long index, unsigned long npages, bool rm)
2048 {
2049         struct iommu_table_group_link *tgl;
2050 
2051         list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
2052                 struct pnv_ioda_pe *pe = container_of(tgl->table_group,
2053                                 struct pnv_ioda_pe, table_group);
2054                 struct pnv_phb *phb = pe->phb;
2055                 unsigned int shift = tbl->it_page_shift;
2056 
2057                 /*
2058                  * NVLink1 can use the TCE kill register directly as
2059                  * it's the same as PHB3. NVLink2 is different and
2060                  * should go via the OPAL call.
2061                  */
2062                 if (phb->model == PNV_PHB_MODEL_NPU) {
2063                         /*
2064                          * The NVLink hardware does not support TCE kill
2065                          * per TCE entry so we have to invalidate
2066                          * the entire cache for it.
2067                          */
2068                         pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2069                         continue;
2070                 }
2071                 if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
2072                         pnv_pci_phb3_tce_invalidate(pe, rm, shift,
2073                                                     index, npages);
2074                 else
2075                         opal_pci_tce_kill(phb->opal_id,
2076                                           OPAL_PCI_TCE_KILL_PAGES,
2077                                           pe->pe_number, 1u << shift,
2078                                           index << shift, npages);
2079         }
2080 }
2081 
2082 void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
2083 {
2084         if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
2085                 pnv_pci_phb3_tce_invalidate_entire(phb, rm);
2086         else
2087                 opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
2088 }
2089 
2090 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
2091                 long npages, unsigned long uaddr,
2092                 enum dma_data_direction direction,
2093                 unsigned long attrs)
2094 {
2095         int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
2096                         attrs);
2097 
2098         if (!ret)
2099                 pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2100 
2101         return ret;
2102 }
2103 
2104 #ifdef CONFIG_IOMMU_API
2105 static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
2106                 unsigned long *hpa, enum dma_data_direction *direction)
2107 {
2108         long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
2109 
2110         if (!ret)
2111                 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
2112 
2113         return ret;
2114 }
2115 
2116 static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
2117                 unsigned long *hpa, enum dma_data_direction *direction)
2118 {
2119         long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
2120 
2121         if (!ret)
2122                 pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
2123 
2124         return ret;
2125 }
2126 #endif
2127 
2128 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
2129                 long npages)
2130 {
2131         pnv_tce_free(tbl, index, npages);
2132 
2133         pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
2134 }
2135 
2136 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
2137         .set = pnv_ioda2_tce_build,
2138 #ifdef CONFIG_IOMMU_API
2139         .exchange = pnv_ioda2_tce_xchg,
2140         .exchange_rm = pnv_ioda2_tce_xchg_rm,
2141         .useraddrptr = pnv_tce_useraddrptr,
2142 #endif
2143         .clear = pnv_ioda2_tce_free,
2144         .get = pnv_tce_get,
2145         .free = pnv_pci_ioda2_table_free_pages,
2146 };
2147 
2148 static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
2149 {
2150         unsigned int *weight = (unsigned int *)data;
2151 
2152         /* This is quite simplistic. The "base" weight of a device
2153          * is 10. 0 means no DMA is to be accounted for it.
2154          */
2155         if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
2156                 return 0;
2157 
2158         if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
2159             dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
2160             dev->class == PCI_CLASS_SERIAL_USB_EHCI)
2161                 *weight += 3;
2162         else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
2163                 *weight += 15;
2164         else
2165                 *weight += 10;
2166 
2167         return 0;
2168 }
2169 
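     /*
      * For example, a PE whose bus holds an EHCI USB controller, a RAID
      * adapter and an ordinary NIC accumulates a weight of 3 + 15 + 10 =
      * 28; bridges (non-normal header types) contribute nothing.
      */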
2170 static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
2171 {
2172         unsigned int weight = 0;
2173 
2174         /* SRIOV VF has same DMA32 weight as its PF */
2175 #ifdef CONFIG_PCI_IOV
2176         if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
2177                 pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
2178                 return weight;
2179         }
2180 #endif
2181 
2182         if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
2183                 pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
2184         } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
2185                 struct pci_dev *pdev;
2186 
2187                 list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
2188                         pnv_pci_ioda_dev_dma_weight(pdev, &weight);
2189         } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
2190                 pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
2191         }
2192 
2193         return weight;
2194 }
2195 
2196 static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
2197                                        struct pnv_ioda_pe *pe)
2198 {
2199 
2200         struct page *tce_mem = NULL;
2201         struct iommu_table *tbl;
2202         unsigned int weight, total_weight = 0;
2203         unsigned int tce32_segsz, base, segs, avail, i;
2204         int64_t rc;
2205         void *addr;
2206 
2207         /* XXX FIXME: Handle 64-bit only DMA devices */
2208         /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
2209         /* XXX FIXME: Allocate multi-level tables on PHB3 */
2210         weight = pnv_pci_ioda_pe_dma_weight(pe);
2211         if (!weight)
2212                 return;
2213 
2214         pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
2215                      &total_weight);
2216         segs = (weight * phb->ioda.dma32_count) / total_weight;
2217         if (!segs)
2218                 segs = 1;
2219 
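              /*
               * For example (illustrative numbers): weight = 10,
               * total_weight = 40 and dma32_count = 16 yield
               * segs = (10 * 16) / 40 = 4, i.e. four 256MB DMA32 segments.
               */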
2220         /*
2221          * Allocate contiguous DMA32 segments. We begin with the expected
2222          * number of segments. On each failed attempt, the number of DMA32
2223          * segments to be allocated is decreased by one, until allocation
2224          * succeeds or a single segment cannot be found.
2225          */
2226         do {
2227                 for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
2228                         for (avail = 0, i = base; i < base + segs; i++) {
2229                                 if (phb->ioda.dma32_segmap[i] ==
2230                                     IODA_INVALID_PE)
2231                                         avail++;
2232                         }
2233 
2234                         if (avail == segs)
2235                                 goto found;
2236                 }
2237         } while (--segs);
2238 
2239         if (!segs) {
2240                 pe_warn(pe, "No available DMA32 segments\n");
2241                 return;
2242         }
2243 
2244 found:
2245         tbl = pnv_pci_table_alloc(phb->hose->node);
2246         if (WARN_ON(!tbl))
2247                 return;
2248 
2249         iommu_register_group(&pe->table_group, phb->hose->global_number,
2250                         pe->pe_number);
2251         pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
2252 
2253         /* Grab a 32-bit TCE table */
2254         pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
2255                 weight, total_weight, base, segs);
2256         pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
2257                 base * PNV_IODA1_DMA32_SEGSIZE,
2258                 (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
2259 
2260         /* XXX Currently, we allocate one big contiguous table for the
2261          * TCEs. We only really need one chunk per 256M of TCE space
2262          * (ie per segment) but that's an optimization for later, it
2263          * requires some added smarts with our get/put_tce implementation
2264          *
2265          * Each TCE page is 4KB in size and each TCE entry occupies 8
2266          * bytes
2267          */
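              /*
               * With 4K TCE pages and 8-byte entries, each 256MB segment
               * needs 0x10000000 >> (12 - 3) = 512KB of TCE table, which
               * is what tce32_segsz computes below.
               */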
2268         tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
2269         tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
2270                                    get_order(tce32_segsz * segs));
2271         if (!tce_mem) {
2272                 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
2273                 goto fail;
2274         }
2275         addr = page_address(tce_mem);
2276         memset(addr, 0, tce32_segsz * segs);
2277 
2278         /* Configure HW */
2279         for (i = 0; i < segs; i++) {
2280                 rc = opal_pci_map_pe_dma_window(phb->opal_id,
2281                                               pe->pe_number,
2282                                               base + i, 1,
2283                                               __pa(addr) + tce32_segsz * i,
2284                                               tce32_segsz, IOMMU_PAGE_SIZE_4K);
2285                 if (rc) {
2286                         pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
2287                                rc);
2288                         goto fail;
2289                 }
2290         }
2291 
2292         /* Setup DMA32 segment mapping */
2293         for (i = base; i < base + segs; i++)
2294                 phb->ioda.dma32_segmap[i] = pe->pe_number;
2295 
2296         /* Setup linux iommu table */
2297         pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
2298                                   base * PNV_IODA1_DMA32_SEGSIZE,
2299                                   IOMMU_PAGE_SHIFT_4K);
2300 
2301         tbl->it_ops = &pnv_ioda1_iommu_ops;
2302         pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
2303         pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
2304         iommu_init_table(tbl, phb->hose->node);
2305 
2306         if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2307                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2308 
2309         return;
2310  fail:
2311         /* XXX Failure: Try to fallback to 64-bit only ? */
2312         if (tce_mem)
2313                 __free_pages(tce_mem, get_order(tce32_segsz * segs));
2314         if (tbl) {
2315                 pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
2316                 iommu_tce_table_put(tbl);
2317         }
2318 }
2319 
2320 static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
2321                 int num, struct iommu_table *tbl)
2322 {
2323         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2324                         table_group);
2325         struct pnv_phb *phb = pe->phb;
2326         int64_t rc;
2327         const unsigned long size = tbl->it_indirect_levels ?
2328                         tbl->it_level_size : tbl->it_size;
2329         const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
2330         const __u64 win_size = tbl->it_size << tbl->it_page_shift;
2331 
2332         pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
2333                 num, start_addr, start_addr + win_size - 1,
2334                 IOMMU_PAGE_SIZE(tbl));
2335 
2336         /*
2337          * Map TCE table through TVT. The TVE index is the PE number
2338          * shifted by 1 bit for the 32-bit DMA space.
2339          */
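              /*
               * For example, PE#5 uses TVE index 10 for window 0 and TVE
               * index 11 for window 1, the 64-bit bypass window set up in
               * pnv_pci_ioda2_set_bypass().
               */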
2340         rc = opal_pci_map_pe_dma_window(phb->opal_id,
2341                         pe->pe_number,
2342                         (pe->pe_number << 1) + num,
2343                         tbl->it_indirect_levels + 1,
2344                         __pa(tbl->it_base),
2345                         size << 3,
2346                         IOMMU_PAGE_SIZE(tbl));
2347         if (rc) {
2348                 pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
2349                 return rc;
2350         }
2351 
2352         pnv_pci_link_table_and_group(phb->hose->node, num,
2353                         tbl, &pe->table_group);
2354         pnv_pci_ioda2_tce_invalidate_pe(pe);
2355 
2356         return 0;
2357 }
2358 
2359 void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
2360 {
2361         uint16_t window_id = (pe->pe_number << 1) + 1;
2362         int64_t rc;
2363 
2364         pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
2365         if (enable) {
2366                 phys_addr_t top = memblock_end_of_DRAM();
2367 
2368                 top = roundup_pow_of_two(top);
2369                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2370                                                      pe->pe_number,
2371                                                      window_id,
2372                                                      pe->tce_bypass_base,
2373                                                      top);
2374         } else {
2375                 rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
2376                                                      pe->pe_number,
2377                                                      window_id,
2378                                                      pe->tce_bypass_base,
2379                                                      0);
2380         }
2381         if (rc)
2382                 pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
2383         else
2384                 pe->tce_bypass_enabled = enable;
2385 }
2386 
2387 static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
2388                 int num, __u32 page_shift, __u64 window_size, __u32 levels,
2389                 bool alloc_userspace_copy, struct iommu_table **ptbl)
2390 {
2391         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2392                         table_group);
2393         int nid = pe->phb->hose->node;
2394         __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
2395         long ret;
2396         struct iommu_table *tbl;
2397 
2398         tbl = pnv_pci_table_alloc(nid);
2399         if (!tbl)
2400                 return -ENOMEM;
2401 
2402         tbl->it_ops = &pnv_ioda2_iommu_ops;
2403 
2404         ret = pnv_pci_ioda2_table_alloc_pages(nid,
2405                         bus_offset, page_shift, window_size,
2406                         levels, alloc_userspace_copy, tbl);
2407         if (ret) {
2408                 iommu_tce_table_put(tbl);
2409                 return ret;
2410         }
2411 
2412         *ptbl = tbl;
2413 
2414         return 0;
2415 }
2416 
2417 static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
2418 {
2419         struct iommu_table *tbl = NULL;
2420         long rc;
2421 
2422         /*
2423          * crashkernel= specifies the kdump kernel's maximum memory at
2424          * some offset and there is no guarantee that the result is a power
2425          * of 2, which will cause errors later.
2426          */
2427         const u64 max_memory = __rounddown_pow_of_two(memory_hotplug_max());
2428 
2429         /*
2430          * In memory constrained environments, e.g. kdump kernel, the
2431          * DMA window can be larger than available memory, which will
2432          * cause errors later.
2433          */
2434         const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
2435 
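              /*
               * For example (illustrative sizes): with tce32_size = 2GB
               * but a kdump kernel given 768MB, max_memory rounds down to
               * 512MB and the default window shrinks to
               * min(2GB, 512MB) = 512MB.
               */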
2436         rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
2437                         IOMMU_PAGE_SHIFT_4K,
2438                         window_size,
2439                         POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
2440         if (rc) {
2441                 pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
2442                                 rc);
2443                 return rc;
2444         }
2445 
2446         iommu_init_table(tbl, pe->phb->hose->node);
2447 
2448         rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
2449         if (rc) {
2450                 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
2451                                 rc);
2452                 iommu_tce_table_put(tbl);
2453                 return rc;
2454         }
2455 
2456         if (!pnv_iommu_bypass_disabled)
2457                 pnv_pci_ioda2_set_bypass(pe, true);
2458 
2459         /*
2460          * Set table base for the case of IOMMU DMA use. Usually this is done
2461          * from dma_dev_setup() which is not called when a device is returned
2462          * from VFIO so do it here.
2463          */
2464         if (pe->pdev)
2465                 set_iommu_table_base(&pe->pdev->dev, tbl);
2466 
2467         return 0;
2468 }
2469 
2470 #if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
2471 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
2472                 int num)
2473 {
2474         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2475                         table_group);
2476         struct pnv_phb *phb = pe->phb;
2477         long ret;
2478 
2479         pe_info(pe, "Removing DMA window #%d\n", num);
2480 
2481         ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
2482                         (pe->pe_number << 1) + num,
2483                         0/* levels */, 0/* table address */,
2484                         0/* table size */, 0/* page size */);
2485         if (ret)
2486                 pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
2487         else
2488                 pnv_pci_ioda2_tce_invalidate_pe(pe);
2489 
2490         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
2491 
2492         return ret;
2493 }
2494 #endif
2495 
2496 #ifdef CONFIG_IOMMU_API
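     /*
      * A worked size calculation with illustrative parameters: for a 1GB
      * window of 4K pages (page_shift = 12) and a single level,
      * entries_shift = 30 - 12 = 18 and table_shift = 21, so both the
      * TCE table and the direct table come to 2MB; the function below
      * then returns 4MB to cover the hardware table plus its userspace
      * copy.
      */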
2497 unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
2498                 __u64 window_size, __u32 levels)
2499 {
2500         unsigned long bytes = 0;
2501         const unsigned window_shift = ilog2(window_size);
2502         unsigned entries_shift = window_shift - page_shift;
2503         unsigned table_shift = entries_shift + 3;
2504         unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
2505         unsigned long direct_table_size;
2506 
2507         if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
2508                         !is_power_of_2(window_size))
2509                 return 0;
2510 
2511         /* Calculate a direct table size from window_size and levels */
2512         entries_shift = (entries_shift + levels - 1) / levels;
2513         table_shift = entries_shift + 3;
2514         table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
2515         direct_table_size = 1UL << table_shift;
2516 
2517         for ( ; levels; --levels) {
2518                 bytes += _ALIGN_UP(tce_table_size, direct_table_size);
2519 
2520                 tce_table_size /= direct_table_size;
2521                 tce_table_size <<= 3;
2522                 tce_table_size = max_t(unsigned long,
2523                                 tce_table_size, direct_table_size);
2524         }
2525 
2526         return bytes + bytes; /* one for HW table, one for userspace copy */
2527 }
2528 
2529 static long pnv_pci_ioda2_create_table_userspace(
2530                 struct iommu_table_group *table_group,
2531                 int num, __u32 page_shift, __u64 window_size, __u32 levels,
2532                 struct iommu_table **ptbl)
2533 {
2534         long ret = pnv_pci_ioda2_create_table(table_group,
2535                         num, page_shift, window_size, levels, true, ptbl);
2536 
2537         if (!ret)
2538                 (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
2539                                 page_shift, window_size, levels);
2540         return ret;
2541 }
2542 
2543 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
2544 {
2545         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2546                                                 table_group);
2547         /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
2548         struct iommu_table *tbl = pe->table_group.tables[0];
2549 
2550         pnv_pci_ioda2_set_bypass(pe, false);
2551         pnv_pci_ioda2_unset_window(&pe->table_group, 0);
2552         if (pe->pbus)
2553                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2554         else if (pe->pdev)
2555                 set_iommu_table_base(&pe->pdev->dev, NULL);
2556         iommu_tce_table_put(tbl);
2557 }
2558 
2559 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
2560 {
2561         struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
2562                                                 table_group);
2563 
2564         pnv_pci_ioda2_setup_default_config(pe);
2565         if (pe->pbus)
2566                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2567 }
2568 
2569 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
2570         .get_table_size = pnv_pci_ioda2_get_table_size,
2571         .create_table = pnv_pci_ioda2_create_table_userspace,
2572         .set_window = pnv_pci_ioda2_set_window,
2573         .unset_window = pnv_pci_ioda2_unset_window,
2574         .take_ownership = pnv_ioda2_take_ownership,
2575         .release_ownership = pnv_ioda2_release_ownership,
2576 };
2577 
2578 static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
2579                 struct iommu_table_group *table_group,
2580                 struct pci_bus *bus)
2581 {
2582         struct pci_dev *dev;
2583 
2584         list_for_each_entry(dev, &bus->devices, bus_list) {
2585                 iommu_add_device(table_group, &dev->dev);
2586 
2587                 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
2588                         pnv_ioda_setup_bus_iommu_group_add_devices(pe,
2589                                         table_group, dev->subordinate);
2590         }
2591 }
2592 
2593 static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
2594                 struct iommu_table_group *table_group, struct pci_bus *bus)
2595 {
2596 
2597         if (pe->flags & PNV_IODA_PE_DEV)
2598                 iommu_add_device(table_group, &pe->pdev->dev);
2599 
2600         if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
2601                 pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
2602                                 bus);
2603 }
2604 
2605 static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
2606 
2607 static void pnv_pci_ioda_setup_iommu_api(void)
2608 {
2609         struct pci_controller *hose;
2610         struct pnv_phb *phb;
2611         struct pnv_ioda_pe *pe;
2612 
2613         /*
2614          * There are 4 types of PEs:
2615          * - PNV_IODA_PE_BUS: a downstream port with an adapter,
2616          *   created from pnv_pci_setup_bridge();
2617          * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
2618          *   created from pnv_pci_setup_bridge();
2619          * - PNV_IODA_PE_VF: a SRIOV virtual function,
2620          *   created from pnv_pcibios_sriov_enable();
2621          * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
2622          *   created from pnv_pci_ioda_fixup().
2623          *
2624          * Normally a PE is represented by an IOMMU group, however for
2625          * devices with side channels the groups need to be more strict.
2626          */
2627         list_for_each_entry(hose, &hose_list, list_node) {
2628                 phb = hose->private_data;
2629 
2630                 if (phb->type == PNV_PHB_NPU_NVLINK ||
2631                     phb->type == PNV_PHB_NPU_OCAPI)
2632                         continue;
2633 
2634                 list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2635                         struct iommu_table_group *table_group;
2636 
2637                         table_group = pnv_try_setup_npu_table_group(pe);
2638                         if (!table_group) {
2639                                 if (!pnv_pci_ioda_pe_dma_weight(pe))
2640                                         continue;
2641 
2642                                 table_group = &pe->table_group;
2643                                 iommu_register_group(&pe->table_group,
2644                                                 pe->phb->hose->global_number,
2645                                                 pe->pe_number);
2646                         }
2647                         pnv_ioda_setup_bus_iommu_group(pe, table_group,
2648                                         pe->pbus);
2649                 }
2650         }
2651 
2652         /*
2653          * Now we have all PHBs discovered, time to add NPU devices to
2654          * the corresponding IOMMU groups.
2655          */
2656         list_for_each_entry(hose, &hose_list, list_node) {
2657                 unsigned long  pgsizes;
2658 
2659                 phb = hose->private_data;
2660 
2661                 if (phb->type != PNV_PHB_NPU_NVLINK)
2662                         continue;
2663 
2664                 pgsizes = pnv_ioda_parse_tce_sizes(phb);
2665                 list_for_each_entry(pe, &phb->ioda.pe_list, list) {
2666                         /*
2667                          * IODA2 bridges get this set up from
2668                          * pci_controller_ops::setup_bridge but NPU bridges
2669                          * do not have this hook defined so we do it here.
2670                          */
2671                         pe->table_group.pgsizes = pgsizes;
2672                         pnv_npu_compound_attach(pe);
2673                 }
2674         }
2675 }
2676 #else /* !CONFIG_IOMMU_API */
2677 static void pnv_pci_ioda_setup_iommu_api(void) { }
2678 #endif
2679 
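     /*
      * The "ibm,supported-tce-sizes" property lists page-size shifts;
      * for example a property of <12 16 24 28> yields a mask of
      * SZ_4K | SZ_64K | SZ_16M | SZ_256M.
      */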
2680 static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
2681 {
2682         struct pci_controller *hose = phb->hose;
2683         struct device_node *dn = hose->dn;
2684         unsigned long mask = 0;
2685         int i, rc, count;
2686         u32 val;
2687 
2688         count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
2689         if (count <= 0) {
2690                 mask = SZ_4K | SZ_64K;
2691                 /* Add 16M and 256M for POWER8 by default */
2692                 if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
2693                                 !cpu_has_feature(CPU_FTR_ARCH_300))
2694                         mask |= SZ_16M | SZ_256M;
2695                 return mask;
2696         }
2697 
2698         for (i = 0; i < count; i++) {
2699                 rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
2700                                                 i, &val);
2701                 if (rc == 0)
2702                         mask |= 1ULL << val;
2703         }
2704 
2705         return mask;
2706 }
2707 
2708 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
2709                                        struct pnv_ioda_pe *pe)
2710 {
2711         int64_t rc;
2712 
2713         if (!pnv_pci_ioda_pe_dma_weight(pe))
2714                 return;
2715 
2716         /* TVE #1 is selected by PCI address bit 59 */
2717         pe->tce_bypass_base = 1ull << 59;
2718 
2719         /* The PE will reserve all possible 32-bits space */
2720         pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
2721                 phb->ioda.m32_pci_base);
2722 
2723         /* Setup linux iommu table */
2724         pe->table_group.tce32_start = 0;
2725         pe->table_group.tce32_size = phb->ioda.m32_pci_base;
2726         pe->table_group.max_dynamic_windows_supported =
2727                         IOMMU_TABLE_GROUP_MAX_TABLES;
2728         pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
2729         pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
2730 #ifdef CONFIG_IOMMU_API
2731         pe->table_group.ops = &pnv_pci_ioda2_ops;
2732 #endif
2733 
2734         rc = pnv_pci_ioda2_setup_default_config(pe);
2735         if (rc)
2736                 return;
2737 
2738         if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
2739                 pnv_ioda_setup_bus_dma(pe, pe->pbus);
2740 }
2741 
2742 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
2743 {
2744         struct pnv_phb *phb = container_of(chip, struct pnv_phb,
2745                                            ioda.irq_chip);
2746 
2747         return opal_pci_msi_eoi(phb->opal_id, hw_irq);
2748 }
2749 
2750 static void pnv_ioda2_msi_eoi(struct irq_data *d)
2751 {
2752         int64_t rc;
2753         unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
2754         struct irq_chip *chip = irq_data_get_irq_chip(d);
2755 
2756         rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
2757         WARN_ON_ONCE(rc);
2758 
2759         icp_native_eoi(d);
2760 }
2761 
2763 void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
2764 {
2765         struct irq_data *idata;
2766         struct irq_chip *ichip;
2767 
2768         /* The MSI EOI OPAL call is only needed on PHB3 */
2769         if (phb->model != PNV_PHB_MODEL_PHB3)
2770                 return;
2771 
2772         if (!phb->ioda.irq_chip_init) {
2773                 /*
2774                  * First time we setup an MSI IRQ, we need to setup the
2775                  * corresponding IRQ chip to route correctly.
2776                  */
2777                 idata = irq_get_irq_data(virq);
2778                 ichip = irq_data_get_irq_chip(idata);
2779                 phb->ioda.irq_chip_init = 1;
2780                 phb->ioda.irq_chip = *ichip;
2781                 phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
2782         }
2783         irq_set_chip(virq, &phb->ioda.irq_chip);
2784 }
2785 
2786 /*
2787  * Returns true iff chip is something that we could call
2788  * pnv_opal_pci_msi_eoi for.
2789  */
2790 bool is_pnv_opal_msi(struct irq_chip *chip)
2791 {
2792         return chip->irq_eoi == pnv_ioda2_msi_eoi;
2793 }
2794 EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
2795 
2796 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
2797                                   unsigned int hwirq, unsigned int virq,
2798                                   unsigned int is_64, struct msi_msg *msg)
2799 {
2800         struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
2801         unsigned int xive_num = hwirq - phb->msi_base;
2802         __be32 data;
2803         int rc;
2804 
2805         /* No PE assigned ? bail out ... no MSI for you ! */
2806         if (pe == NULL)
2807                 return -ENXIO;
2808 
2809         /* Check if we have an MVE */
2810         if (pe->mve_number < 0)
2811                 return -ENXIO;
2812 
2813         /* Force 32-bit MSI on some broken devices */
2814         if (dev->no_64bit_msi)
2815                 is_64 = 0;
2816 
2817         /* Assign XIVE to PE */
2818         rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
2819         if (rc) {
2820                 pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
2821                         pci_name(dev), rc, xive_num);
2822                 return -EIO;
2823         }
2824 
2825         if (is_64) {
2826                 __be64 addr64;
2827 
2828                 rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
2829                                      &addr64, &data);
2830                 if (rc) {
2831                         pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
2832                                 pci_name(dev), rc);
2833                         return -EIO;
2834                 }
2835                 msg->address_hi = be64_to_cpu(addr64) >> 32;
2836                 msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
2837         } else {
2838                 __be32 addr32;
2839 
2840                 rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
2841                                      &addr32, &data);
2842                 if (rc) {
2843                         pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
2844                                 pci_name(dev), rc);
2845                         return -EIO;
2846                 }
2847                 msg->address_hi = 0;
2848                 msg->address_lo = be32_to_cpu(addr32);
2849         }
2850         msg->data = be32_to_cpu(data);
2851 
2852         pnv_set_msi_irq_chip(phb, virq);
2853 
2854         pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
2855                  " address=%x_%08x data=%x PE# %x\n",
2856                  pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
2857                  msg->address_hi, msg->address_lo, data, pe->pe_number);
2858 
2859         return 0;
2860 }
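
/*
 * Illustrative sketch, not part of the driver: how the big-endian
 * 64-bit MSI address returned by OPAL splits into the msi_msg fields,
 * mirroring the 64-bit branch above. E.g. a CPU-endian address of
 * 0x00000003ffffe000 yields address_hi 0x3 and address_lo 0xffffe000.
 */
#if 0
static void example_fill_msi_msg(struct msi_msg *msg, __be64 addr64,
                                 __be32 data)
{
        msg->address_hi = be64_to_cpu(addr64) >> 32;
        msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
        msg->data = be32_to_cpu(data);
}
#endif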
2861 
2862 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
2863 {
2864         unsigned int count;
2865         const __be32 *prop = of_get_property(phb->hose->dn,
2866                                              "ibm,opal-msi-ranges", NULL);
2867         if (!prop) {
2868                 /* BML Fallback */
2869                 prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
2870         }
2871         if (!prop)
2872                 return;
2873 
2874         phb->msi_base = be32_to_cpup(prop);
2875         count = be32_to_cpup(prop + 1);
2876         if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
2877                 pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
2878                        phb->hose->global_number);
2879                 return;
2880         }
2881 
2882         phb->msi_setup = pnv_pci_ioda_msi_setup;
2883         phb->msi32_support = 1;
2884         pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
2885                 count, phb->msi_base);
2886 }
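
/*
 * Illustrative sketch, not part of the driver: "ibm,opal-msi-ranges"
 * is consumed above as two big-endian cells, <base count>. A
 * hypothetical property of <0x1f00 0x100> would give an msi_base of
 * 0x1f00 and a 256-entry MSI bitmap.
 */
#if 0
static void example_parse_msi_ranges(const __be32 *prop,
                                     unsigned int *base, unsigned int *count)
{
        *base = be32_to_cpup(prop);             /* e.g. 0x1f00 */
        *count = be32_to_cpup(prop + 1);        /* e.g. 0x100 => 256 MSIs */
}
#endif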
2887 
2888 #ifdef CONFIG_PCI_IOV
2889 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
2890 {
2891         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
2892         struct pnv_phb *phb = hose->private_data;
2893         const resource_size_t gate = phb->ioda.m64_segsize >> 2;
2894         struct resource *res;
2895         int i;
2896         resource_size_t size, total_vf_bar_sz;
2897         struct pci_dn *pdn;
2898         int mul, total_vfs;
2899 
2900         if (!pdev->is_physfn || pci_dev_is_added(pdev))
2901                 return;
2902 
2903         pdn = pci_get_pdn(pdev);
2904         pdn->vfs_expanded = 0;
2905         pdn->m64_single_mode = false;
2906 
2907         total_vfs = pci_sriov_get_totalvfs(pdev);
2908         mul = phb->ioda.total_pe_num;
2909         total_vf_bar_sz = 0;
2910 
2911         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2912                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
2913                 if (!res->flags || res->parent)
2914                         continue;
2915                 if (!pnv_pci_is_m64_flags(res->flags)) {
2916                         dev_warn(&pdev->dev, "Don't support SR-IOV with"
2917                                         " non-M64 VF BAR%d: %pR\n",
2918                                  i, res);
2919                         goto truncate_iov;
2920                 }
2921 
2922                 total_vf_bar_sz += pci_iov_resource_size(pdev,
2923                                 i + PCI_IOV_RESOURCES);
2924 
2925                 /*
2926                  * If the total is bigger than a quarter of the M64
2927                  * segment size, just round the VF count up to a power of
2928                  * two. Generally, one M64 BAR maps one IOV BAR. To avoid
2929                  * conflicts with other devices, the IOV BAR size is
2930                  * expanded to (total_pe * VF_BAR_size). When VF_BAR_size
2931                  * is half of the M64 segment size, the expanded size
2932                  * would equal half of the whole M64 space, exhausting the
2933                  * M64 space and limiting the system's flexibility. This
2934                  * is a design decision to set the boundary at a quarter
2935                  * of the M64 segment size.
2936                  */
2937                 if (total_vf_bar_sz > gate) {
2938                         mul = roundup_pow_of_two(total_vfs);
2939                         dev_info(&pdev->dev,
2940                                 "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
2941                                 total_vf_bar_sz, gate, mul);
2942                         pdn->m64_single_mode = true;
2943                         break;
2944                 }
2945         }
2946 
2947         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2948                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
2949                 if (!res->flags || res->parent)
2950                         continue;
2951 
2952                 size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
2953                 /*
2954                  * On PHB3, the minimum size alignment of M64 BAR in single
2955                  * mode is 32MB.
2956                  */
2957                 if (pdn->m64_single_mode && (size < SZ_32M))
2958                         goto truncate_iov;
2959                 dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
2960                 res->end = res->start + size * mul - 1;
2961                 dev_dbg(&pdev->dev, "                       %pR\n", res);
2962                 dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
2963                          i, res, mul);
2964         }
2965         pdn->vfs_expanded = mul;
2966 
2967         return;
2968 
2969 truncate_iov:
2970         /* To save MMIO space, IOV BAR is truncated. */
2971         for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
2972                 res = &pdev->resource[i + PCI_IOV_RESOURCES];
2973                 res->flags = 0;
2974                 res->end = res->start - 1;
2975         }
2976 }
2977 #endif /* CONFIG_PCI_IOV */
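
/*
 * Illustrative sketch, not part of the driver: the "gate" above is a
 * quarter of the M64 segment size. With a hypothetical 256MB segment
 * the gate is 64MB, so a device whose VF BARs total more than 64MB is
 * switched to single PE mode with the VF count rounded up to a power
 * of two.
 */
#if 0
static int example_pick_multiplier(resource_size_t m64_segsize,
                                   resource_size_t total_vf_bar_sz,
                                   int total_vfs, int total_pe_num)
{
        const resource_size_t gate = m64_segsize >> 2;  /* 256MB -> 64MB */

        if (total_vf_bar_sz > gate)             /* single PE mode */
                return roundup_pow_of_two(total_vfs);

        return total_pe_num;                    /* shared mode */
}
#endif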
2978 
2979 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
2980                                   struct resource *res)
2981 {
2982         struct pnv_phb *phb = pe->phb;
2983         struct pci_bus_region region;
2984         int index;
2985         int64_t rc;
2986 
2987         if (!res || !res->flags || res->start > res->end)
2988                 return;
2989 
2990         if (res->flags & IORESOURCE_IO) {
2991                 region.start = res->start - phb->ioda.io_pci_base;
2992                 region.end   = res->end - phb->ioda.io_pci_base;
2993                 index = region.start / phb->ioda.io_segsize;
2994 
2995                 while (index < phb->ioda.total_pe_num &&
2996                        region.start <= region.end) {
2997                         phb->ioda.io_segmap[index] = pe->pe_number;
2998                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
2999                                 pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
3000                         if (rc != OPAL_SUCCESS) {
3001                                 pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
3002                                        __func__, rc, index, pe->pe_number);
3003                                 break;
3004                         }
3005 
3006                         region.start += phb->ioda.io_segsize;
3007                         index++;
3008                 }
3009         } else if ((res->flags & IORESOURCE_MEM) &&
3010                    !pnv_pci_is_m64(phb, res)) {
3011                 region.start = res->start -
3012                                phb->hose->mem_offset[0] -
3013                                phb->ioda.m32_pci_base;
3014                 region.end   = res->end -
3015                                phb->hose->mem_offset[0] -
3016                                phb->ioda.m32_pci_base;
3017                 index = region.start / phb->ioda.m32_segsize;
3018 
3019                 while (index < phb->ioda.total_pe_num &&
3020                        region.start <= region.end) {
3021                         phb->ioda.m32_segmap[index] = pe->pe_number;
3022                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3023                                 pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
3024                         if (rc != OPAL_SUCCESS) {
3025                                 pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x\n",
3026                                        __func__, rc, index, pe->pe_number);
3027                                 break;
3028                         }
3029 
3030                         region.start += phb->ioda.m32_segsize;
3031                         index++;
3032                 }
3033         }
3034 }
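
/*
 * Illustrative sketch, not part of the driver: the segment walk above
 * is plain integer arithmetic. With a hypothetical 128MB segment size,
 * a window spanning 0x08000000..0x17ffffff (PHB-relative) covers
 * segments 1 and 2, each of which gets mapped to the PE in turn.
 */
#if 0
static void example_map_segments(resource_size_t start, resource_size_t end,
                                 resource_size_t segsize)
{
        int index = start / segsize;    /* 0x08000000 / 0x08000000 == 1 */

        while (start <= end) {
                /* map segment 'index' to the PE here */
                start += segsize;
                index++;
        }
}
#endif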
3035 
3036 /*
3037  * This function is supposed to be called on a per-PE basis, from
3038  * top to bottom, so the I/O or MMIO segment assigned to a parent
3039  * PE can be overridden by its child PEs if necessary.
3040  */
3041 static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
3042 {
3043         struct pci_dev *pdev;
3044         int i;
3045 
3046         /*
3047          * NOTE: We only care about PCI-bus-based PEs for now.
3048          * PCI-device-based PEs, for example SR-IOV VFs, should
3049          * be figured out later.
3050          */
3051         BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
3052 
3053         list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
3054                 for (i = 0; i <= PCI_ROM_RESOURCE; i++)
3055                         pnv_ioda_setup_pe_res(pe, &pdev->resource[i]);
3056 
3057                 /*
3058                  * If the PE contains all subordinate PCI buses, the
3059                  * windows of the child bridges should be mapped to
3060                  * the PE as well.
3061                  */
3062                 if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev))
3063                         continue;
3064                 for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
3065                         pnv_ioda_setup_pe_res(pe,
3066                                 &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
3067         }
3068 }
3069 
3070 #ifdef CONFIG_DEBUG_FS
3071 static int pnv_pci_diag_data_set(void *data, u64 val)
3072 {
3073         struct pci_controller *hose;
3074         struct pnv_phb *phb;
3075         s64 ret;
3076 
3077         if (val != 1ULL)
3078                 return -EINVAL;
3079 
3080         hose = (struct pci_controller *)data;
3081         if (!hose || !hose->private_data)
3082                 return -ENODEV;
3083 
3084         phb = hose->private_data;
3085 
3086         /* Retrieve the diag data from firmware */
3087         ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
3088                                           phb->diag_data_size);
3089         if (ret != OPAL_SUCCESS)
3090                 return -EIO;
3091 
3092         /* Print the diag data to the kernel log */
3093         pnv_pci_dump_phb_diag_data(phb->hose, phb->diag_data);
3094         return 0;
3095 }
3096 
3097 DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
3098                         pnv_pci_diag_data_set, "%llu\n");
3099 
3100 #endif /* CONFIG_DEBUG_FS */
3101 
3102 static void pnv_pci_ioda_create_dbgfs(void)
3103 {
3104 #ifdef CONFIG_DEBUG_FS
3105         struct pci_controller *hose, *tmp;
3106         struct pnv_phb *phb;
3107         char name[16];
3108 
3109         list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
3110                 phb = hose->private_data;
3111 
3112                 /* Mark the PHB's initialization as done */
3113                 phb->initialized = 1;
3114 
3115                 sprintf(name, "PCI%04x", hose->global_number);
3116                 phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
3117                 if (!phb->dbgfs) {
3118                         pr_warn("%s: Error on creating debugfs on PHB#%x\n",
3119                                 __func__, hose->global_number);
3120                         continue;
3121                 }
3122 
3123                 debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
3124                                     &pnv_pci_diag_data_fops);
3125         }
3126 #endif /* CONFIG_DEBUG_FS */
3127 }
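
/*
 * Usage note (assuming debugfs is mounted at /sys/kernel/debug and
 * powerpc_debugfs_root is the usual "powerpc" directory there):
 * writing 1 to /sys/kernel/debug/powerpc/PCI<NNNN>/dump_diag_regs
 * triggers pnv_pci_diag_data_set() and dumps the PHB diag data to
 * the kernel log.
 */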
3128 
3129 static void pnv_pci_enable_bridge(struct pci_bus *bus)
3130 {
3131         struct pci_dev *dev = bus->self;
3132         struct pci_bus *child;
3133 
3134         /* Empty bus ? bail */
3135         if (list_empty(&bus->devices))
3136                 return;
3137 
3138         /*
3139          * If there's a bridge associated with that bus, enable it. This works
3140          * around races in the generic code if the enabling is done during
3141          * parallel probing. This can be removed once those races have been
3142          * fixed.
3143          */
3144         if (dev) {
3145                 int rc = pci_enable_device(dev);
3146                 if (rc)
3147                         pci_err(dev, "Error enabling bridge (%d)\n", rc);
3148                 pci_set_master(dev);
3149         }
3150 
3151         /* Do the same for the child buses */
3152         list_for_each_entry(child, &bus->children, node)
3153                 pnv_pci_enable_bridge(child);
3154 }
3155 
3156 static void pnv_pci_enable_bridges(void)
3157 {
3158         struct pci_controller *hose;
3159 
3160         list_for_each_entry(hose, &hose_list, list_node)
3161                 pnv_pci_enable_bridge(hose->bus);
3162 }
3163 
3164 static void pnv_pci_ioda_fixup(void)
3165 {
3166         pnv_pci_ioda_setup_PEs();
3167         pnv_pci_ioda_setup_iommu_api();
3168         pnv_pci_ioda_create_dbgfs();
3169 
3170         pnv_pci_enable_bridges();
3171 
3172 #ifdef CONFIG_EEH
3173         pnv_eeh_post_init();
3174 #endif
3175 }
3176 
3177 /*
3178  * Returns the alignment for I/O or memory windows of P2P
3179  * bridges. That actually depends on how PEs are segmented.
3180  * For now, we return the I/O or M32 segment size for
3181  * PE-sensitive P2P bridges. Otherwise, the default values
3182  * (4KiB for I/O, 1MiB for memory) are returned.
3183  *
3184  * The current PCI bus might be put into one PE, which was
3185  * created against the parent PCI bridge. In that case, we
3186  * needn't enlarge the alignment, which saves some
3187  * resources.
3188  */
3189 static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
3190                                                 unsigned long type)
3191 {
3192         struct pci_dev *bridge;
3193         struct pci_controller *hose = pci_bus_to_host(bus);
3194         struct pnv_phb *phb = hose->private_data;
3195         int num_pci_bridges = 0;
3196 
3197         bridge = bus->self;
3198         while (bridge) {
3199                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
3200                         num_pci_bridges++;
3201                         if (num_pci_bridges >= 2)
3202                                 return 1;
3203                 }
3204 
3205                 bridge = bridge->bus->self;
3206         }
3207 
3208         /*
3209          * We fall back to M32 if M64 isn't supported. We enforce the M64
3210          * alignment for any 64-bit resource; PCIe doesn't care and
3211          * bridges only do 64-bit prefetchable anyway.
3212          */
3213         if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type))
3214                 return phb->ioda.m64_segsize;
3215         if (type & IORESOURCE_MEM)
3216                 return phb->ioda.m32_segsize;
3217 
3218         return phb->ioda.io_segsize;
3219 }
3220 
3221 /*
3222  * We update the root port, or the upstream port of the bridge
3223  * behind the root port, with the PHB's windows in order to
3224  * accommodate changes in the resources required during PCI
3225  * (slot) hotplug. The hotplug slot is connected to either the
3226  * root port or the downstream ports of a PCIe switch behind
3227  * the root port.
3228  */
3229 static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
3230                                            unsigned long type)
3231 {
3232         struct pci_controller *hose = pci_bus_to_host(bus);
3233         struct pnv_phb *phb = hose->private_data;
3234         struct pci_dev *bridge = bus->self;
3235         struct resource *r, *w;
3236         bool msi_region = false;
3237         int i;
3238 
3239         /* Check if we need to apply the fixup to the bridge's windows */
3240         if (!pci_is_root_bus(bridge->bus) &&
3241             !pci_is_root_bus(bridge->bus->self->bus))
3242                 return;
3243 
3244         /* Fixup the resources */
3245         for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
3246                 r = &bridge->resource[PCI_BRIDGE_RESOURCES + i];
3247                 if (!r->flags || !r->parent)
3248                         continue;
3249 
3250                 w = NULL;
3251                 if (r->flags & type & IORESOURCE_IO)
3252                         w = &hose->io_resource;
3253                 else if (pnv_pci_is_m64(phb, r) &&
3254                          (type & IORESOURCE_PREFETCH) &&
3255                          phb->ioda.m64_segsize)
3256                         w = &hose->mem_resources[1];
3257                 else if (r->flags & type & IORESOURCE_MEM) {
3258                         w = &hose->mem_resources[0];
3259                         msi_region = true;
3260                 }
3261 
3262                 r->start = w->start;
3263                 r->end = w->end;
3264 
3265                 /* The 64KB 32-bit MSI region shouldn't be included in
3266                  * the 32-bit bridge window. Otherwise, we can see strange
3267                  * issues; one of them is an EEH error observed on Garrison.
3268                  *
3269                  * Exclude the top 1MB region, which is the minimal
3270                  * alignment of the 32-bit bridge window.
3271                  */
3272                 if (msi_region) {
3273                         r->end += 0x10000;
3274                         r->end -= 0x100000;
3275                 }
3276         }
3277 }
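
/*
 * Illustrative worked example, not part of the driver: the MSI-region
 * fixup above first re-adds the 64KB MSI hole and then drops the top
 * 1MB, moving the window end down by 0xf0000 net. With a hypothetical
 * window ending at 0x7ffeffff:
 *
 *      0x7ffeffff + 0x10000  = 0x7fffffff      (re-add 64KB MSI space)
 *      0x7fffffff - 0x100000 = 0x7fefffff      (exclude the top 1MB)
 */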
3278 
3279 static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
3280 {
3281         struct pci_controller *hose = pci_bus_to_host(bus);
3282         struct pnv_phb *phb = hose->private_data;
3283         struct pci_dev *bridge = bus->self;
3284         struct pnv_ioda_pe *pe;
3285         bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
3286 
3287         /* Extend bridge's windows if necessary */
3288         pnv_pci_fixup_bridge_resources(bus, type);
3289 
3290         /* The PE for the root bus should be realized before any other */
3291         if (!phb->ioda.root_pe_populated) {
3292                 pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
3293                 if (pe) {
3294                         phb->ioda.root_pe_idx = pe->pe_number;
3295                         phb->ioda.root_pe_populated = true;
3296                 }
3297         }
3298 
3299         /* Don't assign a PE to a PCI bus that has no subordinate devices */
3300         if (list_empty(&bus->devices))
3301                 return;
3302 
3303         /* Reserve PEs according to used M64 resources */
3304         pnv_ioda_reserve_m64_pe(bus, NULL, all);
3305 
3306         /*
3307          * Assign the PE. We might get here because of partial
3308          * hotplug; in that case, we just pick up the existing
3309          * PE and must not allocate resources again.
3310          */
3311         pe = pnv_ioda_setup_bus_PE(bus, all);
3312         if (!pe)
3313                 return;
3314 
3315         pnv_ioda_setup_pe_seg(pe);
3316         switch (phb->type) {
3317         case PNV_PHB_IODA1:
3318                 pnv_pci_ioda1_setup_dma_pe(phb, pe);
3319                 break;
3320         case PNV_PHB_IODA2:
3321                 pnv_pci_ioda2_setup_dma_pe(phb, pe);
3322                 break;
3323         default:
3324                 pr_warn("%s: No DMA for PHB#%x (type %d)\n",
3325                         __func__, phb->hose->global_number, phb->type);
3326         }
3327 }
3328 
3329 static resource_size_t pnv_pci_default_alignment(void)
3330 {
3331         return PAGE_SIZE;
3332 }
3333 
3334 #ifdef CONFIG_PCI_IOV
3335 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
3336                                                       int resno)
3337 {
3338         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3339         struct pnv_phb *phb = hose->private_data;
3340         struct pci_dn *pdn = pci_get_pdn(pdev);
3341         resource_size_t align;
3342 
3343         /*
3344          * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR
3345          * to enable SR-IOV, and from the hardware's perspective the
3346          * range mapped by an M64 BAR must be size-aligned.
3347          *
3348          * When the IOV BAR is mapped with an M64 BAR in Single PE mode,
3349          * that extra powernv-specific hardware restriction is gone. But
3350          * if we just used the VF BAR size as the alignment, a PF BAR and
3351          * a VF BAR could be allocated within one segment of M64 #15,
3352          * introducing a PE conflict between the PF and the VF. Based on
3353          * this, the minimum alignment of an IOV BAR is m64_segsize.
3354          *
3355          * This function returns the total IOV BAR size if the M64 BAR is
3356          * in Shared PE mode, or just the VF BAR size if not. If the M64
3357          * BAR is in Single PE mode, it returns the VF BAR size, or the
3358          * M64 segment size if the IOV BAR size is smaller.
3359          */
3360         align = pci_iov_resource_size(pdev, resno);
3361         if (!pdn->vfs_expanded)
3362                 return align;
3363         if (pdn->m64_single_mode)
3364                 return max(align, (resource_size_t)phb->ioda.m64_segsize);
3365 
3366         return pdn->vfs_expanded * align;
3367 }
3368 #endif /* CONFIG_PCI_IOV */
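
/*
 * Illustrative sketch, not part of the driver: with a hypothetical 8MB
 * VF BAR, 256 PEs and a 256MB M64 segment, the three cases in the
 * alignment function above resolve as follows.
 */
#if 0
static void example_iov_alignments(void)
{
        resource_size_t align = SZ_8M;  /* per-VF BAR size */

        /* not expanded: plain VF BAR size */
        resource_size_t plain = align;                          /* 8MB */
        /* single PE mode: at least one full M64 segment */
        resource_size_t single = max(align, (resource_size_t)SZ_256M);
        /* shared mode with vfs_expanded == 256: whole expanded BAR */
        resource_size_t shared = 256 * align;                   /* 2GB */
}
#endif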
3369 
3370 /* Prevent enabling devices for which we couldn't properly
3371  * assign a PE
3372  */
3373 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
3374 {
3375         struct pci_controller *hose = pci_bus_to_host(dev->bus);
3376         struct pnv_phb *phb = hose->private_data;
3377         struct pci_dn *pdn;
3378 
3379         /* The function is probably called while the PEs have
3380          * not been created yet, for example during resource
3381          * reassignment in the PCI probe period. We just skip
3382          * the check if the PEs aren't ready.
3383          */
3384         if (!phb->initialized)
3385                 return true;
3386 
3387         pdn = pci_get_pdn(dev);
3388         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3389                 return false;
3390 
3391         return true;
3392 }
3393 
3394 static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
3395                                        int num)
3396 {
3397         struct pnv_ioda_pe *pe = container_of(table_group,
3398                                               struct pnv_ioda_pe, table_group);
3399         struct pnv_phb *phb = pe->phb;
3400         unsigned int idx;
3401         long rc;
3402 
3403         pe_info(pe, "Removing DMA window #%d\n", num);
3404         for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
3405                 if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
3406                         continue;
3407 
3408                 rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
3409                                                 idx, 0, 0ul, 0ul, 0ul);
3410                 if (rc != OPAL_SUCCESS) {
3411                         pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
3412                                 rc, idx);
3413                         return rc;
3414                 }
3415 
3416                 phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
3417         }
3418 
3419         pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
3420         return OPAL_SUCCESS;
3421 }
3422 
3423 static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
3424 {
3425         unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3426         struct iommu_table *tbl = pe->table_group.tables[0];
3427         int64_t rc;
3428 
3429         if (!weight)
3430                 return;
3431 
3432         rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
3433         if (rc != OPAL_SUCCESS)
3434                 return;
3435 
3436         pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
3437         if (pe->table_group.group) {
3438                 iommu_group_put(pe->table_group.group);
3439                 WARN_ON(pe->table_group.group);
3440         }
3441 
3442         free_pages(tbl->it_base, get_order(tbl->it_size << 3));
3443         iommu_tce_table_put(tbl);
3444 }
3445 
3446 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
3447 {
3448         struct iommu_table *tbl = pe->table_group.tables[0];
3449         unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
3450 #ifdef CONFIG_IOMMU_API
3451         int64_t rc;
3452 #endif
3453 
3454         if (!weight)
3455                 return;
3456 
3457 #ifdef CONFIG_IOMMU_API
3458         rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
3459         if (rc)
3460                 pe_warn(pe, "OPAL error %lld releasing DMA window\n", rc);
3461 #endif
3462 
3463         pnv_pci_ioda2_set_bypass(pe, false);
3464         if (pe->table_group.group) {
3465                 iommu_group_put(pe->table_group.group);
3466                 WARN_ON(pe->table_group.group);
3467         }
3468 
3469         iommu_tce_table_put(tbl);
3470 }
3471 
3472 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
3473                                  unsigned short win,
3474                                  unsigned int *map)
3475 {
3476         struct pnv_phb *phb = pe->phb;
3477         int idx;
3478         int64_t rc;
3479 
3480         for (idx = 0; idx < phb->ioda.total_pe_num; idx++) {
3481                 if (map[idx] != pe->pe_number)
3482                         continue;
3483 
3484                 if (win == OPAL_M64_WINDOW_TYPE)
3485                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3486                                         phb->ioda.reserved_pe_idx, win,
3487                                         idx / PNV_IODA1_M64_SEGS,
3488                                         idx % PNV_IODA1_M64_SEGS);
3489                 else
3490                         rc = opal_pci_map_pe_mmio_window(phb->opal_id,
3491                                         phb->ioda.reserved_pe_idx, win, 0, idx);
3492 
3493                 if (rc != OPAL_SUCCESS)
3494                         pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
3495                                 rc, win, idx);
3496 
3497                 map[idx] = IODA_INVALID_PE;
3498         }
3499 }
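
/*
 * Illustrative sketch, not part of the driver: on IODA1 each M64 BAR
 * carries PNV_IODA1_M64_SEGS (8) segments, so the flat segment index
 * decomposes into a (window, segment) pair as done above; index 21,
 * for instance, is M64 BAR 2, segment 5.
 */
#if 0
static void example_m64_decompose(int idx, int *win, int *seg)
{
        *win = idx / PNV_IODA1_M64_SEGS;        /* 21 / 8 == 2 */
        *seg = idx % PNV_IODA1_M64_SEGS;        /* 21 % 8 == 5 */
}
#endif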
3500 
3501 static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
3502 {
3503         struct pnv_phb *phb = pe->phb;
3504 
3505         if (phb->type == PNV_PHB_IODA1) {
3506                 pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
3507                                      phb->ioda.io_segmap);
3508                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3509                                      phb->ioda.m32_segmap);
3510                 pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
3511                                      phb->ioda.m64_segmap);
3512         } else if (phb->type == PNV_PHB_IODA2) {
3513                 pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
3514                                      phb->ioda.m32_segmap);
3515         }
3516 }
3517 
3518 static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
3519 {
3520         struct pnv_phb *phb = pe->phb;
3521         struct pnv_ioda_pe *slave, *tmp;
3522 
3523         list_del(&pe->list);
3524         switch (phb->type) {
3525         case PNV_PHB_IODA1:
3526                 pnv_pci_ioda1_release_pe_dma(pe);
3527                 break;
3528         case PNV_PHB_IODA2:
3529                 pnv_pci_ioda2_release_pe_dma(pe);
3530                 break;
3531         default:
3532                 WARN_ON(1);
3533         }
3534 
3535         pnv_ioda_release_pe_seg(pe);
3536         pnv_ioda_deconfigure_pe(pe->phb, pe);
3537 
3538         /* Release slave PEs in the compound PE */
3539         if (pe->flags & PNV_IODA_PE_MASTER) {
3540                 list_for_each_entry_safe(slave, tmp, &pe->slaves, list) {
3541                         list_del(&slave->list);
3542                         pnv_ioda_free_pe(slave);
3543                 }
3544         }
3545 
3546         /*
3547          * The PE for the root bus can be removed by hotplug during EEH
3548          * recovery for a fenced PHB error. We need to mark the PE dead
3549          * so that it can be populated again in the PCI hot-add path;
3550          * it shouldn't be destroyed, as it's a globally reserved resource.
3551          */
3552         if (phb->ioda.root_pe_populated &&
3553             phb->ioda.root_pe_idx == pe->pe_number)
3554                 phb->ioda.root_pe_populated = false;
3555         else
3556                 pnv_ioda_free_pe(pe);
3557 }
3558 
3559 static void pnv_pci_release_device(struct pci_dev *pdev)
3560 {
3561         struct pci_controller *hose = pci_bus_to_host(pdev->bus);
3562         struct pnv_phb *phb = hose->private_data;
3563         struct pci_dn *pdn = pci_get_pdn(pdev);
3564         struct pnv_ioda_pe *pe;
3565 
3566         if (pdev->is_virtfn)
3567                 return;
3568 
3569         if (!pdn || pdn->pe_number == IODA_INVALID_PE)
3570                 return;
3571 
3572         /*
3573          * PCI hotplug can happen as part of EEH error recovery; the @pdn
3574          * isn't removed and re-added in that scenario. We need to set the
3575          * PE number in @pdn to an invalid one here. Otherwise, the PE's
3576          * device count would be decreased when removing devices but never
3577          * increased when re-adding them, leading to an unbalanced device
3578          * count that eventually breaks the normal PCI hotplug path.
3579          */
3580         pe = &phb->ioda.pe_array[pdn->pe_number];
3581         pdn->pe_number = IODA_INVALID_PE;
3582 
3583         WARN_ON(--pe->device_count < 0);
3584         if (pe->device_count == 0)
3585                 pnv_ioda_release_pe(pe);
3586 }
3587 
3588 static void pnv_npu_disable_device(struct pci_dev *pdev)
3589 {
3590         struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
3591         struct eeh_pe *eehpe = edev ? edev->pe : NULL;
3592 
3593         if (eehpe && eeh_ops && eeh_ops->reset)
3594                 eeh_ops->reset(eehpe, EEH_RESET_HOT);
3595 }
3596 
3597 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
3598 {
3599         struct pnv_phb *phb = hose->private_data;
3600 
3601         opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
3602                        OPAL_ASSERT_RESET);
3603 }
3604 
3605 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
3606         .dma_dev_setup          = pnv_pci_dma_dev_setup,
3607         .dma_bus_setup          = pnv_pci_dma_bus_setup,
3608         .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
3609         .setup_msi_irqs         = pnv_setup_msi_irqs,
3610         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3611         .enable_device_hook     = pnv_pci_enable_device_hook,
3612         .release_device         = pnv_pci_release_device,
3613         .window_alignment       = pnv_pci_window_alignment,
3614         .setup_bridge           = pnv_pci_setup_bridge,
3615         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3616         .shutdown               = pnv_pci_ioda_shutdown,
3617 };
3618 
3619 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
3620         .dma_dev_setup          = pnv_pci_dma_dev_setup,
3621         .setup_msi_irqs         = pnv_setup_msi_irqs,
3622         .teardown_msi_irqs      = pnv_teardown_msi_irqs,
3623         .enable_device_hook     = pnv_pci_enable_device_hook,
3624         .window_alignment       = pnv_pci_window_alignment,
3625         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3626         .shutdown               = pnv_pci_ioda_shutdown,
3627         .disable_device         = pnv_npu_disable_device,
3628 };
3629 
3630 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
3631         .enable_device_hook     = pnv_pci_enable_device_hook,
3632         .window_alignment       = pnv_pci_window_alignment,
3633         .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
3634         .shutdown               = pnv_pci_ioda_shutdown,
3635 };
3636 
3637 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
3638                                          u64 hub_id, int ioda_type)
3639 {
3640         struct pci_controller *hose;
3641         struct pnv_phb *phb;
3642         unsigned long size, m64map_off, m32map_off, pemap_off;
3643         unsigned long iomap_off = 0, dma32map_off = 0;
3644         struct resource r;
3645         const __be64 *prop64;
3646         const __be32 *prop32;
3647         int len;
3648         unsigned int segno;
3649         u64 phb_id;
3650         void *aux;
3651         long rc;
3652 
3653         if (!of_device_is_available(np))
3654                 return;
3655 
3656         pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np);
3657 
3658         prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
3659         if (!prop64) {
3660                 pr_err("  Missing \"ibm,opal-phbid\" property !\n");
3661                 return;
3662         }
3663         phb_id = be64_to_cpup(prop64);
3664         pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
3665 
3666         phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
3667         if (!phb)
3668                 panic("%s: Failed to allocate %zu bytes\n", __func__,
3669                       sizeof(*phb));
3670 
3671         /* Allocate PCI controller */
3672         phb->hose = hose = pcibios_alloc_controller(np);
3673         if (!phb->hose) {
3674                 pr_err("  Can't allocate PCI controller for %pOF\n",
3675                        np);
3676                 memblock_free(__pa(phb), sizeof(struct pnv_phb));
3677                 return;
3678         }
3679 
3680         spin_lock_init(&phb->lock);
3681         prop32 = of_get_property(np, "bus-range", &len);
3682         if (prop32 && len == 8) {
3683                 hose->first_busno = be32_to_cpu(prop32[0]);
3684                 hose->last_busno = be32_to_cpu(prop32[1]);
3685         } else {
3686                 pr_warn("  Broken <bus-range> on %pOF\n", np);
3687                 hose->first_busno = 0;
3688                 hose->last_busno = 0xff;
3689         }
3690         hose->private_data = phb;
3691         phb->hub_id = hub_id;
3692         phb->opal_id = phb_id;
3693         phb->type = ioda_type;
3694         mutex_init(&phb->ioda.pe_alloc_mutex);
3695 
3696         /* Detect specific models for error handling */
3697         if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
3698                 phb->model = PNV_PHB_MODEL_P7IOC;
3699         else if (of_device_is_compatible(np, "ibm,power8-pciex"))
3700                 phb->model = PNV_PHB_MODEL_PHB3;
3701         else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
3702                 phb->model = PNV_PHB_MODEL_NPU;
3703         else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
3704                 phb->model = PNV_PHB_MODEL_NPU2;
3705         else
3706                 phb->model = PNV_PHB_MODEL_UNKNOWN;
3707 
3708         /* Initialize diagnostic data buffer */
3709         prop32 = of_get_property(np, "ibm,phb-diag-data-size", NULL);
3710         if (prop32)
3711                 phb->diag_data_size = be32_to_cpup(prop32);
3712         else
3713                 phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
3714 
3715         phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
3716         if (!phb->diag_data)
3717                 panic("%s: Failed to allocate %u bytes\n", __func__,
3718                       phb->diag_data_size);
3719 
3720         /* Parse 32-bit and IO ranges (if any) */
3721         pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
3722 
3723         /* Get registers */
3724         if (!of_address_to_resource(np, 0, &r)) {
3725                 phb->regs_phys = r.start;
3726                 phb->regs = ioremap(r.start, resource_size(&r));
3727                 if (phb->regs == NULL)
3728                         pr_err("  Failed to map registers !\n");
3729         }
3730 
3731         /* Initialize more IODA stuff */
3732         phb->ioda.total_pe_num = 1;
3733         prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
3734         if (prop32)
3735                 phb->ioda.total_pe_num = be32_to_cpup(prop32);
3736         prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
3737         if (prop32)
3738                 phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
3739 
3740         /* Invalidate RID to PE# mapping */
3741         for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++)
3742                 phb->ioda.pe_rmap[segno] = IODA_INVALID_PE;
3743 
3744         /* Parse 64-bit MMIO range */
3745         pnv_ioda_parse_m64_window(phb);
3746 
3747         phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
3748         /* FW has already taken the top 64k (MSI space) off the M32 space */
3749         phb->ioda.m32_size += 0x10000;
3750 
3751         phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
3752         phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
3753         phb->ioda.io_size = hose->pci_io_size;
3754         phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
3755         phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
3756 
3757         /* Calculate how many 32-bit TCE segments we have */
3758         phb->ioda.dma32_count = phb->ioda.m32_pci_base /
3759                                 PNV_IODA1_DMA32_SEGSIZE;
3760 
3761         /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
3762         size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
3763                         sizeof(unsigned long));
3764         m64map_off = size;
3765         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
3766         m32map_off = size;
3767         size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
3768         if (phb->type == PNV_PHB_IODA1) {
3769                 iomap_off = size;
3770                 size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
3771                 dma32map_off = size;
3772                 size += phb->ioda.dma32_count *
3773                         sizeof(phb->ioda.dma32_segmap[0]);
3774         }
3775         pemap_off = size;
3776         size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
3777         aux = memblock_alloc(size, SMP_CACHE_BYTES);
3778         if (!aux)
3779                 panic("%s: Failed to allocate %lu bytes\n", __func__, size);
3780         phb->ioda.pe_alloc = aux;
3781         phb->ioda.m64_segmap = aux + m64map_off;
3782         phb->ioda.m32_segmap = aux + m32map_off;
3783         for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
3784                 phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
3785                 phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
3786         }
3787         if (phb->type == PNV_PHB_IODA1) {
3788                 phb->ioda.io_segmap = aux + iomap_off;
3789                 for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
3790                         phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
3791 
3792                 phb->ioda.dma32_segmap = aux + dma32map_off;
3793                 for (segno = 0; segno < phb->ioda.dma32_count; segno++)
3794                         phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
3795         }
3796         phb->ioda.pe_array = aux + pemap_off;
3797 
3798         /*
3799          * Choose a PE number for the root bus, which shouldn't have
3800          * M64 resources consumed by its child devices. We pick the
3801          * PE number adjacent to the reserved one if possible.
3802          */
3803         pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx);
3804         if (phb->ioda.reserved_pe_idx == 0) {
3805                 phb->ioda.root_pe_idx = 1;
3806                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3807         } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) {
3808                 phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
3809                 pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
3810         } else {
3811                 phb->ioda.root_pe_idx = IODA_INVALID_PE;
3812         }
3813 
3814         INIT_LIST_HEAD(&phb->ioda.pe_list);
3815         mutex_init(&phb->ioda.pe_list_mutex);
3820 
3821 #if 0 /* We should really do that ... */
3822         rc = opal_pci_set_phb_mem_window(opal->phb_id,
3823                                          window_type,
3824                                          window_num,
3825                                          starting_real_address,
3826                                          starting_pci_address,
3827                                          segment_size);
3828 #endif
3829 
3830         pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
3831                 phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
3832                 phb->ioda.m32_size, phb->ioda.m32_segsize);
3833         if (phb->ioda.m64_size)
3834                 pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
3835                         phb->ioda.m64_size, phb->ioda.m64_segsize);
3836         if (phb->ioda.io_size)
3837                 pr_info("                  IO: 0x%x [segment=0x%x]\n",
3838                         phb->ioda.io_size, phb->ioda.io_segsize);
3839 
3841         phb->hose->ops = &pnv_pci_ops;
3842         phb->get_pe_state = pnv_ioda_get_pe_state;
3843         phb->freeze_pe = pnv_ioda_freeze_pe;
3844         phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
3845 
3846         /* Setup MSI support */
3847         pnv_pci_init_ioda_msis(phb);
3848 
3849         /*
3850          * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here to let
3851          * the PCI core do the resource assignment. The expectation is
3852          * that the PCI core will apply correct I/O and MMIO alignment
3853          * to the P2P bridge BARs so that each PCI bus (excluding the
3854          * child P2P bridges) can form an individual PE.
3855          */
3856         ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
3857 
3858         switch (phb->type) {
3859         case PNV_PHB_NPU_NVLINK:
3860                 hose->controller_ops = pnv_npu_ioda_controller_ops;
3861                 break;
3862         case PNV_PHB_NPU_OCAPI:
3863                 hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
3864                 break;
3865         default:
3866                 phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
3867                 hose->controller_ops = pnv_pci_ioda_controller_ops;
3868         }
3869 
3870         ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
3871 
3872 #ifdef CONFIG_PCI_IOV
3873         ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
3874         ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
3875         ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
3876         ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
3877 #endif
3878 
3879         pci_add_flags(PCI_REASSIGN_ALL_RSRC);
3880 
3881         /* Reset IODA tables to a clean state */
3882         rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
3883         if (rc)
3884                 pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
3885 
3886         /*
3887          * If we're running in a kdump kernel, the previous kernel never
3888          * shut down PCI devices correctly, and we already had the IODA
3889          * table cleaned out above. So we have to issue a PHB reset to
3890          * stop all PCI transactions from the previous kernel. The
3891          * ppc_pci_reset_phbs kernel parameter will force this reset
3892          * too. Additionally, if the IODA reset above failed, then we
3893          * use a bigger hammer; this can happen if we get a PHB fatal
3894          * error in very early boot.
3895          */
3896         if (is_kdump_kernel() || pci_reset_phbs || rc) {
3897                 pr_info("  Issue PHB reset ...\n");
3898                 pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
3899                 pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
3900         }
3901 
3902         /* Remove M64 resource if we can't configure it successfully */
3903         if (!phb->init_m64 || phb->init_m64(phb))
3904                 hose->mem_resources[1].flags = 0;
3905 }
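
/*
 * Illustrative sketch, not part of the driver: the single "aux"
 * allocation above packs the PE allocation bitmap, the segment maps
 * and the PE array back to back, with each sub-array recovered by
 * offsetting into the blob, as in this reduced two-field example.
 */
#if 0
static void example_carve_blob(void *aux, size_t bitmap_bytes,
                               unsigned long **bitmap,
                               unsigned int **m32_segmap)
{
        *bitmap = aux;                          /* offset 0 */
        *m32_segmap = aux + bitmap_bytes;       /* immediately after */
}
#endif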
3906 
3907 void __init pnv_pci_init_ioda2_phb(struct device_node *np)
3908 {
3909         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
3910 }
3911 
3912 void __init pnv_pci_init_npu_phb(struct device_node *np)
3913 {
3914         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
3915 }
3916 
3917 void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
3918 {
3919         pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
3920 }
3921 
3922 static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
3923 {
3924         struct pci_controller *hose = pci_bus_to_host(dev->bus);
3925         struct pnv_phb *phb = hose->private_data;
3926 
3927         if (!machine_is(powernv))
3928                 return;
3929 
3930         if (phb->type == PNV_PHB_NPU_OCAPI)
3931                 dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
3932 }
3933 DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
3934 
3935 void __init pnv_pci_init_ioda_hub(struct device_node *np)
3936 {
3937         struct device_node *phbn;
3938         const __be64 *prop64;
3939         u64 hub_id;
3940 
3941         pr_info("Probing IODA IO-Hub %pOF\n", np);
3942 
3943         prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
3944         if (!prop64) {
3945                 pr_err(" Missing \"ibm,opal-hubid\" property !\n");
3946                 return;
3947         }
3948         hub_id = be64_to_cpup(prop64);
3949         pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
3950 
3951         /* Count child PHBs */
3952         for_each_child_of_node(np, phbn) {
3953                 /* Look for IODA1 PHBs */
3954                 if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
3955                         pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
3956         }
3957 }
3958 
