Linux/arch/powerpc/mm/book3s64/radix_pgtable.c

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Page table handling routines for radix page table.
  4  *
  5  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
  6  */
  7 
  8 #define pr_fmt(fmt) "radix-mmu: " fmt
  9 
 10 #include <linux/kernel.h>
 11 #include <linux/sched/mm.h>
 12 #include <linux/memblock.h>
 13 #include <linux/of_fdt.h>
 14 #include <linux/mm.h>
 15 #include <linux/string_helpers.h>
 16 #include <linux/stop_machine.h>
 17 
 18 #include <asm/pgtable.h>
 19 #include <asm/pgalloc.h>
 20 #include <asm/mmu_context.h>
 21 #include <asm/dma.h>
 22 #include <asm/machdep.h>
 23 #include <asm/mmu.h>
 24 #include <asm/firmware.h>
 25 #include <asm/powernv.h>
 26 #include <asm/sections.h>
 27 #include <asm/trace.h>
 28 #include <asm/uaccess.h>
 29 
 30 #include <trace/events/thp.h>
 31 
 32 unsigned int mmu_pid_bits;
 33 unsigned int mmu_base_pid;
 34 
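    /*
     * Point partition table entry 0 at the process table: keep the
     * existing first doubleword and write the table base, size and
     * PATB_GR (radix translation) into the second doubleword.
     */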
 35 static int native_register_process_table(unsigned long base, unsigned long pg_sz,
 36                                          unsigned long table_size)
 37 {
 38         unsigned long patb0, patb1;
 39 
 40         patb0 = be64_to_cpu(partition_tb[0].patb0);
 41         patb1 = base | table_size | PATB_GR;
 42 
 43         mmu_partition_table_set_entry(0, patb0, patb1);
 44 
 45         return 0;
 46 }
 47 
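    /*
     * Boot-time page table allocator: grab naturally aligned memory
     * from memblock, optionally restricted to a node and physical
     * range, and panic if the allocation fails.
     */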
 48 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 49                         unsigned long region_start, unsigned long region_end)
 50 {
 51         phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
 52         phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
 53         void *ptr;
 54 
 55         if (region_start)
 56                 min_addr = region_start;
 57         if (region_end)
 58                 max_addr = region_end;
 59 
 60         ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
 61 
 62         if (!ptr)
 63                 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
 64                       __func__, size, size, nid, &min_addr, &max_addr);
 65 
 66         return ptr;
 67 }
 68 
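    /*
     * Map a kernel page before slab is up, allocating any missing
     * intermediate levels from memblock. map_page_size selects a
     * PUD, PMD or PTE sized mapping.
     */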
 69 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 70                           pgprot_t flags,
 71                           unsigned int map_page_size,
 72                           int nid,
 73                           unsigned long region_start, unsigned long region_end)
 74 {
 75         unsigned long pfn = pa >> PAGE_SHIFT;
 76         pgd_t *pgdp;
 77         pud_t *pudp;
 78         pmd_t *pmdp;
 79         pte_t *ptep;
 80 
 81         pgdp = pgd_offset_k(ea);
 82         if (pgd_none(*pgdp)) {
 83                 pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
 84                                                 region_start, region_end);
 85                 pgd_populate(&init_mm, pgdp, pudp);
 86         }
 87         pudp = pud_offset(pgdp, ea);
 88         if (map_page_size == PUD_SIZE) {
 89                 ptep = (pte_t *)pudp;
 90                 goto set_the_pte;
 91         }
 92         if (pud_none(*pudp)) {
 93                 pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
 94                                                 region_start, region_end);
 95                 pud_populate(&init_mm, pudp, pmdp);
 96         }
 97         pmdp = pmd_offset(pudp, ea);
 98         if (map_page_size == PMD_SIZE) {
 99                 ptep = pmdp_ptep(pmdp);
100                 goto set_the_pte;
101         }
102         if (!pmd_present(*pmdp)) {
103                 ptep = early_alloc_pgtable(PAGE_SIZE, nid,
104                                                 region_start, region_end);
105                 pmd_populate_kernel(&init_mm, pmdp, ptep);
106         }
107         ptep = pte_offset_kernel(pmdp, ea);
108 
109 set_the_pte:
110         set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
111         smp_wmb();
112         return 0;
113 }
114 
115 /*
116  * nid, region_start, and region_end are hints to try to place the page
117  * table memory in the same node or region.
118  */
119 static int __map_kernel_page(unsigned long ea, unsigned long pa,
120                           pgprot_t flags,
121                           unsigned int map_page_size,
122                           int nid,
123                           unsigned long region_start, unsigned long region_end)
124 {
125         unsigned long pfn = pa >> PAGE_SHIFT;
126         pgd_t *pgdp;
127         pud_t *pudp;
128         pmd_t *pmdp;
129         pte_t *ptep;
130         /*
 131          * Make sure task size is correct as per the max addr
132          */
133         BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
134 
135 #ifdef CONFIG_PPC_64K_PAGES
136         BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
137 #endif
138 
139         if (unlikely(!slab_is_available()))
140                 return early_map_kernel_page(ea, pa, flags, map_page_size,
141                                                 nid, region_start, region_end);
142 
143         /*
144          * Should make page table allocation functions be able to take a
145          * node, so we can place kernel page tables on the right nodes after
146          * boot.
147          */
148         pgdp = pgd_offset_k(ea);
149         pudp = pud_alloc(&init_mm, pgdp, ea);
150         if (!pudp)
151                 return -ENOMEM;
152         if (map_page_size == PUD_SIZE) {
153                 ptep = (pte_t *)pudp;
154                 goto set_the_pte;
155         }
156         pmdp = pmd_alloc(&init_mm, pudp, ea);
157         if (!pmdp)
158                 return -ENOMEM;
159         if (map_page_size == PMD_SIZE) {
160                 ptep = pmdp_ptep(pmdp);
161                 goto set_the_pte;
162         }
163         ptep = pte_alloc_kernel(pmdp, ea);
164         if (!ptep)
165                 return -ENOMEM;
166 
167 set_the_pte:
168         set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
169         smp_wmb();
170         return 0;
171 }
172 
173 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
174                           pgprot_t flags,
175                           unsigned int map_page_size)
176 {
177         return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
178 }
179 
180 #ifdef CONFIG_STRICT_KERNEL_RWX
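    /*
     * Clear the given PTE bits (e.g. _PAGE_WRITE or _PAGE_EXEC) on all
     * kernel mappings covering [start, end), at whatever level they are
     * mapped, then flush the TLB for the range.
     */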
181 void radix__change_memory_range(unsigned long start, unsigned long end,
182                                 unsigned long clear)
183 {
184         unsigned long idx;
185         pgd_t *pgdp;
186         pud_t *pudp;
187         pmd_t *pmdp;
188         pte_t *ptep;
189 
190         start = ALIGN_DOWN(start, PAGE_SIZE);
191         end = PAGE_ALIGN(end); // aligns up
192 
193         pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
194                  start, end, clear);
195 
196         for (idx = start; idx < end; idx += PAGE_SIZE) {
197                 pgdp = pgd_offset_k(idx);
198                 pudp = pud_alloc(&init_mm, pgdp, idx);
199                 if (!pudp)
200                         continue;
201                 if (pud_huge(*pudp)) {
202                         ptep = (pte_t *)pudp;
203                         goto update_the_pte;
204                 }
205                 pmdp = pmd_alloc(&init_mm, pudp, idx);
206                 if (!pmdp)
207                         continue;
208                 if (pmd_huge(*pmdp)) {
209                         ptep = pmdp_ptep(pmdp);
210                         goto update_the_pte;
211                 }
212                 ptep = pte_alloc_kernel(pmdp, idx);
213                 if (!ptep)
214                         continue;
215 update_the_pte:
216                 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
217         }
218 
219         radix__flush_tlb_kernel_range(start, end);
220 }
221 
222 void radix__mark_rodata_ro(void)
223 {
224         unsigned long start, end;
225 
226         start = (unsigned long)_stext;
227         end = (unsigned long)__init_begin;
228 
229         radix__change_memory_range(start, end, _PAGE_WRITE);
230 }
231 
232 void radix__mark_initmem_nx(void)
233 {
234         unsigned long start = (unsigned long)__init_begin;
235         unsigned long end = (unsigned long)__init_end;
236 
237         radix__change_memory_range(start, end, _PAGE_EXEC);
238 }
239 #endif /* CONFIG_STRICT_KERNEL_RWX */
240 
241 static inline void __meminit
242 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
243 {
244         char buf[10];
245 
246         if (end <= start)
247                 return;
248 
249         string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
250 
251         pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
252                 exec ? " (exec)" : "");
253 }
254 
255 static unsigned long next_boundary(unsigned long addr, unsigned long end)
256 {
257 #ifdef CONFIG_STRICT_KERNEL_RWX
258         if (addr < __pa_symbol(__init_begin))
259                 return __pa_symbol(__init_begin);
260 #endif
261         return end;
262 }
263 
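     /*
      * Map a physical range into the kernel linear mapping, using the
      * largest page size (1G, 2M or base page) that alignment and the
      * next boundary allow. Ranges overlapping kernel or interrupt
      * vector text are mapped executable.
      */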
264 static int __meminit create_physical_mapping(unsigned long start,
265                                              unsigned long end,
266                                              int nid)
267 {
268         unsigned long vaddr, addr, mapping_size = 0;
269         bool prev_exec, exec = false;
270         pgprot_t prot;
271         int psize;
272 
273         start = _ALIGN_UP(start, PAGE_SIZE);
274         for (addr = start; addr < end; addr += mapping_size) {
275                 unsigned long gap, previous_size;
276                 int rc;
277 
278                 gap = next_boundary(addr, end) - addr;
279                 previous_size = mapping_size;
280                 prev_exec = exec;
281 
282                 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
283                     mmu_psize_defs[MMU_PAGE_1G].shift) {
284                         mapping_size = PUD_SIZE;
285                         psize = MMU_PAGE_1G;
286                 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
287                            mmu_psize_defs[MMU_PAGE_2M].shift) {
288                         mapping_size = PMD_SIZE;
289                         psize = MMU_PAGE_2M;
290                 } else {
291                         mapping_size = PAGE_SIZE;
292                         psize = mmu_virtual_psize;
293                 }
294 
295                 vaddr = (unsigned long)__va(addr);
296 
297                 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
298                     overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
299                         prot = PAGE_KERNEL_X;
300                         exec = true;
301                 } else {
302                         prot = PAGE_KERNEL;
303                         exec = false;
304                 }
305 
306                 if (mapping_size != previous_size || exec != prev_exec) {
307                         print_mapping(start, addr, previous_size, prev_exec);
308                         start = addr;
309                 }
310 
311                 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
312                 if (rc)
313                         return rc;
314 
315                 update_page_count(psize, 1);
316         }
317 
318         print_mapping(start, addr, mapping_size, exec);
319         return 0;
320 }
321 
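     /*
      * Create the linear mapping for all memblock memory, work out the
      * PID allocation range, and allocate and register the host process
      * table. init_mm gets the first PID as a guard PID.
      */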
322 void __init radix_init_pgtable(void)
323 {
324         unsigned long rts_field;
325         struct memblock_region *reg;
326 
327         /* We don't support slb for radix */
328         mmu_slb_size = 0;
329         /*
330          * Create the linear mapping, using standard page size for now
331          */
332         for_each_memblock(memory, reg) {
333                 /*
 334                  * The memblock allocator is up at this point, so the
 335                  * page tables will be allocated within the range. No
 336                  * need for a node (which we don't have yet).
337                  */
338 
339                 if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
340                         pr_warn("Outside the supported range\n");
341                         continue;
342                 }
343 
344                 WARN_ON(create_physical_mapping(reg->base,
345                                                 reg->base + reg->size,
346                                                 -1));
347         }
348 
349         /* Find out how many PID bits are supported */
350         if (cpu_has_feature(CPU_FTR_HVMODE)) {
351                 if (!mmu_pid_bits)
352                         mmu_pid_bits = 20;
353 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
354                 /*
355                  * When KVM is possible, we only use the top half of the
356                  * PID space to avoid collisions between host and guest PIDs
357                  * which can cause problems due to prefetch when exiting the
358                  * guest with AIL=3
359                  */
360                 mmu_base_pid = 1 << (mmu_pid_bits - 1);
361 #else
362                 mmu_base_pid = 1;
363 #endif
364         } else {
365                 /* The guest uses the bottom half of the PID space */
366                 if (!mmu_pid_bits)
367                         mmu_pid_bits = 19;
368                 mmu_base_pid = 1;
369         }
370 
371         /*
372          * Allocate Partition table and process table for the
373          * host.
374          */
375         BUG_ON(PRTB_SIZE_SHIFT > 36);
376         process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
377         /*
378          * Fill in the process table.
379          */
380         rts_field = radix__get_tree_size();
381         process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
382         /*
 383          * Fill in the partition table. We are supposed to use the effective
 384          * address of the process table here, but our linear mapping also
 385          * enables us to use the physical address.
386          */
387         register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
388         pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
389         asm volatile("ptesync" : : : "memory");
390         asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
391                      "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
392         asm volatile("eieio; tlbsync; ptesync" : : : "memory");
393         trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
394 
395         /*
396          * The init_mm context is given the first available (non-zero) PID,
397          * which is the "guard PID" and contains no page table. PIDR should
398          * never be set to zero because that duplicates the kernel address
399          * space at the 0x0... offset (quadrant 0)!
400          *
401          * An arbitrary PID that may later be allocated by the PID allocator
402          * for userspace processes must not be used either, because that
403          * would cause stale user mappings for that PID on CPUs outside of
404          * the TLB invalidation scheme (because it won't be in mm_cpumask).
405          *
406          * So permanently carve out one PID for the purpose of a guard PID.
407          */
408         init_mm.context.id = mmu_base_pid;
409         mmu_base_pid++;
410 }
411 
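     /*
      * Set up partition table entry 0 for the host: radix tree size,
      * kernel PGD and root index size, with PATB_HR selecting host
      * radix translation.
      */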
412 static void __init radix_init_partition_table(void)
413 {
414         unsigned long rts_field, dw0;
415 
416         mmu_partition_table_init();
417         rts_field = radix__get_tree_size();
418         dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
419         mmu_partition_table_set_entry(0, dw0, 0);
420 
421         pr_info("Initializing Radix MMU\n");
422         pr_info("Partition table %p\n", partition_tb);
423 }
424 
425 void __init radix_init_native(void)
426 {
427         register_process_table = native_register_process_table;
428 }
429 
430 static int __init get_idx_from_shift(unsigned int shift)
431 {
432         int idx = -1;
433 
434         switch (shift) {
435         case 0xc:
436                 idx = MMU_PAGE_4K;
437                 break;
438         case 0x10:
439                 idx = MMU_PAGE_64K;
440                 break;
441         case 0x15:
442                 idx = MMU_PAGE_2M;
443                 break;
444         case 0x1e:
445                 idx = MMU_PAGE_1G;
446                 break;
447         }
448         return idx;
449 }
450 
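     /*
      * of_scan_flat_dt() callback: read the PID width and the supported
      * radix page sizes (AP encodings) from the boot CPU node.
      */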
451 static int __init radix_dt_scan_page_sizes(unsigned long node,
452                                            const char *uname, int depth,
453                                            void *data)
454 {
455         int size = 0;
456         int shift, idx;
457         unsigned int ap;
458         const __be32 *prop;
459         const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
460 
461         /* We are scanning "cpu" nodes only */
462         if (type == NULL || strcmp(type, "cpu") != 0)
463                 return 0;
464 
465         /* Find MMU PID size */
466         prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
467         if (prop && size == 4)
468                 mmu_pid_bits = be32_to_cpup(prop);
469 
470         /* Grab page size encodings */
471         prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
472         if (!prop)
473                 return 0;
474 
475         pr_info("Page sizes from device-tree:\n");
476         for (; size >= 4; size -= 4, ++prop) {
477 
478                 struct mmu_psize_def *def;
479 
 480                 /* top 3 bits are the AP encoding */
481                 shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
482                 ap = be32_to_cpu(prop[0]) >> 29;
483                 pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
484 
485                 idx = get_idx_from_shift(shift);
486                 if (idx < 0)
487                         continue;
488 
489                 def = &mmu_psize_defs[idx];
490                 def->shift = shift;
491                 def->ap  = ap;
492         }
493 
494         /* needed ? */
495         cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
496         return 1;
497 }
498 
499 void __init radix__early_init_devtree(void)
500 {
501         int rc;
502 
503         /*
504          * Try to find the available page sizes in the device-tree
505          */
506         rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
507         if (rc != 0)  /* Found */
508                 goto found;
509         /*
 510          * let's assume we have 4K and 64K page support
511          */
512         mmu_psize_defs[MMU_PAGE_4K].shift = 12;
513         mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
514 
515         mmu_psize_defs[MMU_PAGE_64K].shift = 16;
516         mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
517 found:
518         return;
519 }
520 
521 static void radix_init_amor(void)
522 {
523         /*
 524          * In HV mode, we init AMOR (Authority Mask Override Register) so that
 525          * the hypervisor and guest can set up IAMR (Instruction Authority Mask
 526          * Register), enable key 0 and set it to 1.
 527          *
 528          * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
 529          */
530         mtspr(SPRN_AMOR, (3ul << 62));
531 }
532 
533 #ifdef CONFIG_PPC_KUEP
534 void setup_kuep(bool disabled)
535 {
536         if (disabled || !early_radix_enabled())
537                 return;
538 
539         if (smp_processor_id() == boot_cpuid)
540                 pr_info("Activating Kernel Userspace Execution Prevention\n");
541 
542         /*
543          * Radix always uses key0 of the IAMR to determine if an access is
544          * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
545          * fetch.
546          */
547         mtspr(SPRN_IAMR, (1ul << 62));
548 }
549 #endif
550 
551 #ifdef CONFIG_PPC_KUAP
552 void setup_kuap(bool disabled)
553 {
554         if (disabled || !early_radix_enabled())
555                 return;
556 
557         if (smp_processor_id() == boot_cpuid) {
558                 pr_info("Activating Kernel Userspace Access Prevention\n");
559                 cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
560         }
561 
562         /* Make sure userspace can't change the AMR */
563         mtspr(SPRN_UAMOR, 0);
564         mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
565         isync();
566 }
567 #endif
568 
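     /*
      * Boot CPU radix setup: pick page sizes, publish the radix page
      * table geometry and kernel virtual layout, set up the partition
      * table on bare metal (or the pseries hooks under a hypervisor),
      * build the kernel page tables and switch to the guard PID.
      */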
569 void __init radix__early_init_mmu(void)
570 {
571         unsigned long lpcr;
572 
573 #ifdef CONFIG_PPC_64K_PAGES
574         /* PAGE_SIZE mappings */
575         mmu_virtual_psize = MMU_PAGE_64K;
576 #else
577         mmu_virtual_psize = MMU_PAGE_4K;
578 #endif
579 
580 #ifdef CONFIG_SPARSEMEM_VMEMMAP
581         /* vmemmap mapping */
582         if (mmu_psize_defs[MMU_PAGE_2M].shift) {
583                 /*
584                  * map vmemmap using 2M if available
585                  */
586                 mmu_vmemmap_psize = MMU_PAGE_2M;
587         } else
588                 mmu_vmemmap_psize = mmu_virtual_psize;
589 #endif
590         /*
591          * initialize page table size
592          */
593         __pte_index_size = RADIX_PTE_INDEX_SIZE;
594         __pmd_index_size = RADIX_PMD_INDEX_SIZE;
595         __pud_index_size = RADIX_PUD_INDEX_SIZE;
596         __pgd_index_size = RADIX_PGD_INDEX_SIZE;
597         __pud_cache_index = RADIX_PUD_INDEX_SIZE;
598         __pte_table_size = RADIX_PTE_TABLE_SIZE;
599         __pmd_table_size = RADIX_PMD_TABLE_SIZE;
600         __pud_table_size = RADIX_PUD_TABLE_SIZE;
601         __pgd_table_size = RADIX_PGD_TABLE_SIZE;
602 
603         __pmd_val_bits = RADIX_PMD_VAL_BITS;
604         __pud_val_bits = RADIX_PUD_VAL_BITS;
605         __pgd_val_bits = RADIX_PGD_VAL_BITS;
606 
607         __kernel_virt_start = RADIX_KERN_VIRT_START;
608         __vmalloc_start = RADIX_VMALLOC_START;
609         __vmalloc_end = RADIX_VMALLOC_END;
610         __kernel_io_start = RADIX_KERN_IO_START;
611         __kernel_io_end = RADIX_KERN_IO_END;
612         vmemmap = (struct page *)RADIX_VMEMMAP_START;
613         ioremap_bot = IOREMAP_BASE;
614 
615 #ifdef CONFIG_PCI
616         pci_io_base = ISA_IO_BASE;
617 #endif
618         __pte_frag_nr = RADIX_PTE_FRAG_NR;
619         __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
620         __pmd_frag_nr = RADIX_PMD_FRAG_NR;
621         __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
622 
623         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
624                 radix_init_native();
625                 lpcr = mfspr(SPRN_LPCR);
626                 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
627                 radix_init_partition_table();
628                 radix_init_amor();
629         } else {
630                 radix_init_pseries();
631         }
632 
633         memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
634 
635         radix_init_pgtable();
636         /* Switch to the guard PID before turning on MMU */
637         radix__switch_mmu_context(NULL, &init_mm);
638         if (cpu_has_feature(CPU_FTR_HVMODE))
639                 tlbiel_all();
640 }
641 
642 void radix__early_init_mmu_secondary(void)
643 {
644         unsigned long lpcr;
645         /*
646          * update partition table control register and UPRT
647          */
648         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
649                 lpcr = mfspr(SPRN_LPCR);
650                 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
651 
652                 mtspr(SPRN_PTCR,
653                       __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
654                 radix_init_amor();
655         }
656 
657         radix__switch_mmu_context(NULL, &init_mm);
658         if (cpu_has_feature(CPU_FTR_HVMODE))
659                 tlbiel_all();
660 }
661 
662 void radix__mmu_cleanup_all(void)
663 {
664         unsigned long lpcr;
665 
666         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
667                 lpcr = mfspr(SPRN_LPCR);
668                 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
669                 mtspr(SPRN_PTCR, 0);
670                 powernv_set_nmmu_ptcr(0);
671                 radix__flush_tlb_all();
672         }
673 }
674 
675 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
676                                 phys_addr_t first_memblock_size)
677 {
678         /*
679          * We don't currently support the first MEMBLOCK not mapping 0
680          * physical on those processors
681          */
682         BUG_ON(first_memblock_base != 0);
683 
684         /*
685          * Radix mode is not limited by RMA / VRMA addressing.
686          */
687         ppc64_rma_size = ULONG_MAX;
688 }
689 
690 #ifdef CONFIG_MEMORY_HOTPLUG
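     /*
      * Free a PTE page and clear its parent PMD entry once every PTE in
      * the page is none. free_pmd_table() below does the same one level up.
      */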
691 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
692 {
693         pte_t *pte;
694         int i;
695 
696         for (i = 0; i < PTRS_PER_PTE; i++) {
697                 pte = pte_start + i;
698                 if (!pte_none(*pte))
699                         return;
700         }
701 
702         pte_free_kernel(&init_mm, pte_start);
703         pmd_clear(pmd);
704 }
705 
706 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
707 {
708         pmd_t *pmd;
709         int i;
710 
711         for (i = 0; i < PTRS_PER_PMD; i++) {
712                 pmd = pmd_start + i;
713                 if (!pmd_none(*pmd))
714                         return;
715         }
716 
717         pmd_free(&init_mm, pmd_start);
718         pud_clear(pud);
719 }
720 
721 struct change_mapping_params {
722         pte_t *pte;
723         unsigned long start;
724         unsigned long end;
725         unsigned long aligned_start;
726         unsigned long aligned_end;
727 };
728 
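     /*
      * Runs under stop_machine(): clear the huge mapping and remap the
      * head [aligned_start, start) and tail [end, aligned_end) of the
      * aligned region with smaller pages, leaving [start, end) unmapped.
      */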
729 static int __meminit stop_machine_change_mapping(void *data)
730 {
731         struct change_mapping_params *params =
732                         (struct change_mapping_params *)data;
733 
734         if (!data)
735                 return -1;
736 
737         spin_unlock(&init_mm.page_table_lock);
738         pte_clear(&init_mm, params->aligned_start, params->pte);
739         create_physical_mapping(params->aligned_start, params->start, -1);
740         create_physical_mapping(params->end, params->aligned_end, -1);
741         spin_lock(&init_mm.page_table_lock);
742         return 0;
743 }
744 
745 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
746                              unsigned long end)
747 {
748         unsigned long next;
749         pte_t *pte;
750 
751         pte = pte_start + pte_index(addr);
752         for (; addr < end; addr = next, pte++) {
753                 next = (addr + PAGE_SIZE) & PAGE_MASK;
754                 if (next > end)
755                         next = end;
756 
757                 if (!pte_present(*pte))
758                         continue;
759 
760                 if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
761                         /*
762                          * The vmemmap_free() and remove_section_mapping()
763                          * codepaths call us with aligned addresses.
764                          */
765                         WARN_ONCE(1, "%s: unaligned range\n", __func__);
766                         continue;
767                 }
768 
769                 pte_clear(&init_mm, addr, pte);
770         }
771 }
772 
773 /*
 774  * Helper to clear the pte and potentially split the mapping.
775  */
776 static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
777                                 unsigned long size, pte_t *pte)
778 {
779         unsigned long mask = ~(size - 1);
780         unsigned long aligned_start = addr & mask;
781         unsigned long aligned_end = addr + size;
782         struct change_mapping_params params;
783         bool split_region = false;
784 
785         if ((end - addr) < size) {
786                 /*
 787                  * We're going to clear the PTE, but have not yet
 788                  * flushed the mapping, so we need to remap and flush.
 789                  * If the effects are visible outside the processor,
 790                  * or if we are running in code close to the mapping
 791                  * we cleared, we are in trouble.
792                  */
793                 if (overlaps_kernel_text(aligned_start, addr) ||
794                         overlaps_kernel_text(end, aligned_end)) {
795                         /*
796                          * Hack, just return, don't pte_clear
797                          */
798                         WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
799                                   "text, not splitting\n", addr, end);
800                         return;
801                 }
802                 split_region = true;
803         }
804 
805         if (split_region) {
806                 params.pte = pte;
807                 params.start = addr;
808                 params.end = end;
809                 params.aligned_start = addr & ~(size - 1);
810                 params.aligned_end = min_t(unsigned long, aligned_end,
811                                 (unsigned long)__va(memblock_end_of_DRAM()));
812                 stop_machine(stop_machine_change_mapping, &params, NULL);
813                 return;
814         }
815 
816         pte_clear(&init_mm, addr, pte);
817 }
818 
819 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
820                              unsigned long end)
821 {
822         unsigned long next;
823         pte_t *pte_base;
824         pmd_t *pmd;
825 
826         pmd = pmd_start + pmd_index(addr);
827         for (; addr < end; addr = next, pmd++) {
828                 next = pmd_addr_end(addr, end);
829 
830                 if (!pmd_present(*pmd))
831                         continue;
832 
833                 if (pmd_huge(*pmd)) {
834                         split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
835                         continue;
836                 }
837 
838                 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
839                 remove_pte_table(pte_base, addr, next);
840                 free_pte_table(pte_base, pmd);
841         }
842 }
843 
844 static void remove_pud_table(pud_t *pud_start, unsigned long addr,
845                              unsigned long end)
846 {
847         unsigned long next;
848         pmd_t *pmd_base;
849         pud_t *pud;
850 
851         pud = pud_start + pud_index(addr);
852         for (; addr < end; addr = next, pud++) {
853                 next = pud_addr_end(addr, end);
854 
855                 if (!pud_present(*pud))
856                         continue;
857 
858                 if (pud_huge(*pud)) {
859                         split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
860                         continue;
861                 }
862 
863                 pmd_base = (pmd_t *)pud_page_vaddr(*pud);
864                 remove_pmd_table(pmd_base, addr, next);
865                 free_pmd_table(pmd_base, pud);
866         }
867 }
868 
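     /*
      * Tear down kernel mappings in [start, end): split huge mappings
      * that the range only partially covers, clear the rest, free empty
      * page table pages and flush the TLB.
      */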
869 static void __meminit remove_pagetable(unsigned long start, unsigned long end)
870 {
871         unsigned long addr, next;
872         pud_t *pud_base;
873         pgd_t *pgd;
874 
875         spin_lock(&init_mm.page_table_lock);
876 
877         for (addr = start; addr < end; addr = next) {
878                 next = pgd_addr_end(addr, end);
879 
880                 pgd = pgd_offset_k(addr);
881                 if (!pgd_present(*pgd))
882                         continue;
883 
884                 if (pgd_huge(*pgd)) {
885                         split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
886                         continue;
887                 }
888 
889                 pud_base = (pud_t *)pgd_page_vaddr(*pgd);
890                 remove_pud_table(pud_base, addr, next);
891         }
892 
893         spin_unlock(&init_mm.page_table_lock);
894         radix__flush_tlb_kernel_range(start, end);
895 }
896 
897 int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
898 {
899         if (end >= RADIX_VMALLOC_START) {
900                 pr_warn("Outside the supported range\n");
901                 return -1;
902         }
903 
904         return create_physical_mapping(__pa(start), __pa(end), nid);
905 }
906 
907 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
908 {
909         remove_pagetable(start, end);
910         return 0;
911 }
912 #endif /* CONFIG_MEMORY_HOTPLUG */
913 
914 #ifdef CONFIG_SPARSEMEM_VMEMMAP
915 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
916                                  pgprot_t flags, unsigned int map_page_size,
917                                  int nid)
918 {
919         return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
920 }
921 
922 int __meminit radix__vmemmap_create_mapping(unsigned long start,
923                                       unsigned long page_size,
924                                       unsigned long phys)
925 {
926         /* Create a PTE encoding */
927         unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
928         int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
929         int ret;
930 
931         if ((start + page_size) >= RADIX_VMEMMAP_END) {
932                 pr_warn("Outside the supported range\n");
933                 return -1;
934         }
935 
936         ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
937         BUG_ON(ret);
938 
939         return 0;
940 }
941 
942 #ifdef CONFIG_MEMORY_HOTPLUG
943 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
944 {
945         remove_pagetable(start, start + page_size);
946 }
947 #endif
948 #endif
949 
950 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
951 
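     /*
      * Atomically clear/set bits in a huge (THP) PMD; the caller holds
      * the PMD lock. Returns the old PMD value.
      */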
952 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
953                                   pmd_t *pmdp, unsigned long clr,
954                                   unsigned long set)
955 {
956         unsigned long old;
957 
958 #ifdef CONFIG_DEBUG_VM
959         WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
960         assert_spin_locked(pmd_lockptr(mm, pmdp));
961 #endif
962 
963         old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
964         trace_hugepage_update(addr, old, clr, set);
965 
966         return old;
967 }
968 
969 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
970                         pmd_t *pmdp)
971 
972 {
973         pmd_t pmd;
974 
975         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
976         VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
977         VM_BUG_ON(pmd_devmap(*pmdp));
978         /*
979          * khugepaged calls this for normal pmd
980          */
981         pmd = *pmdp;
982         pmd_clear(pmdp);
983 
984         /*FIXME!!  Verify whether we need this kick below */
985         serialize_against_pte_lookup(vma->vm_mm);
986 
987         radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
988 
989         return pmd;
990 }
991 
992 /*
 993  * For us pgtable_t is pte_t *. In order to save the deposited
994  * page table, we consider the allocated page table as a list
995  * head. On withdraw we need to make sure we zero out the used
996  * list_head memory area.
997  */
998 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
999                                  pgtable_t pgtable)
1000 {
1001         struct list_head *lh = (struct list_head *) pgtable;
1002 
1003         assert_spin_locked(pmd_lockptr(mm, pmdp));
1004 
1005         /* FIFO */
1006         if (!pmd_huge_pte(mm, pmdp))
1007                 INIT_LIST_HEAD(lh);
1008         else
1009                 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1010         pmd_huge_pte(mm, pmdp) = pgtable;
1011 }
1012 
1013 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1014 {
1015         pte_t *ptep;
1016         pgtable_t pgtable;
1017         struct list_head *lh;
1018 
1019         assert_spin_locked(pmd_lockptr(mm, pmdp));
1020 
1021         /* FIFO */
1022         pgtable = pmd_huge_pte(mm, pmdp);
1023         lh = (struct list_head *) pgtable;
1024         if (list_empty(lh))
1025                 pmd_huge_pte(mm, pmdp) = NULL;
1026         else {
1027                 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1028                 list_del(lh);
1029         }
1030         ptep = (pte_t *) pgtable;
1031         *ptep = __pte(0);
1032         ptep++;
1033         *ptep = __pte(0);
1034         return pgtable;
1035 }
1036 
1037 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1038                                      unsigned long addr, pmd_t *pmdp)
1039 {
1040         pmd_t old_pmd;
1041         unsigned long old;
1042 
1043         old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1044         old_pmd = __pmd(old);
1045         /*
1046          * Serialize against find_current_mm_pte which does lock-less
1047          * lookup in page tables with local interrupts disabled. For huge pages
1048          * it casts pmd_t to pte_t. Since format of pte_t is different from
1049          * pmd_t we want to prevent transit from pmd pointing to page table
1050          * to pmd pointing to huge page (and back) while interrupts are disabled.
1051          * We clear pmd to possibly replace it with page table pointer in
1052          * different code paths. So make sure we wait for the parallel
1053          * find_current_mm_pte to finish.
1054          */
1055         serialize_against_pte_lookup(mm);
1056         return old_pmd;
1057 }
1058 
1059 int radix__has_transparent_hugepage(void)
1060 {
1061         /* For radix, 2M at PMD level means THP */
1062         if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
1063                 return 1;
1064         return 0;
1065 }
1066 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1067 
1068 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1069                                   pte_t entry, unsigned long address, int psize)
1070 {
1071         struct mm_struct *mm = vma->vm_mm;
1072         unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
1073                                               _PAGE_RW | _PAGE_EXEC);
1074 
1075         unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1076         /*
1077          * To avoid NMMU hang while relaxing access, we need mark
1078          * the pte invalid in between.
1079          */
1080         if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
1081                 unsigned long old_pte, new_pte;
1082 
1083                 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1084                 /*
1085                  * new value of pte
1086                  */
1087                 new_pte = old_pte | set;
1088                 radix__flush_tlb_page_psize(mm, address, psize);
1089                 __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1090         } else {
1091                 __radix_pte_update(ptep, 0, set);
1092                 /*
1093                  * Book3S does not require a TLB flush when relaxing access
1094                  * restrictions when the address space is not attached to a
1095                  * NMMU, because the core MMU will reload the pte after taking
1096          * an access fault, which is defined by the architecture.
1097                  */
1098         }
1099         /* See ptesync comment in radix__set_pte_at */
1100 }
1101 
1102 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1103                                     unsigned long addr, pte_t *ptep,
1104                                     pte_t old_pte, pte_t pte)
1105 {
1106         struct mm_struct *mm = vma->vm_mm;
1107 
1108         /*
1109          * To avoid NMMU hang while relaxing access, we need to flush the TLB before
1110          * we set the new value. We need to do this only for radix, because hash
1111          * translation does flush when updating the linux pte.
1112          */
1113         if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1114             (atomic_read(&mm->context.copros) > 0))
1115                 radix__flush_tlb_page(vma, addr);
1116 
1117         set_pte_at(mm, addr, ptep, pte);
1118 }
1119 
