TOMOYO Linux Cross Reference
Linux/arch/sparc/mm/init_64.c

  1 /*
  2  *  arch/sparc64/mm/init.c
  3  *
  4  *  Copyright (C) 1996-1999 David S. Miller (davem@caip.rutgers.edu)
  5  *  Copyright (C) 1997-1999 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
  6  */
  7  
  8 #include <linux/module.h>
  9 #include <linux/kernel.h>
 10 #include <linux/sched.h>
 11 #include <linux/string.h>
 12 #include <linux/init.h>
 13 #include <linux/bootmem.h>
 14 #include <linux/mm.h>
 15 #include <linux/hugetlb.h>
 16 #include <linux/initrd.h>
 17 #include <linux/swap.h>
 18 #include <linux/pagemap.h>
 19 #include <linux/poison.h>
 20 #include <linux/fs.h>
 21 #include <linux/seq_file.h>
 22 #include <linux/kprobes.h>
 23 #include <linux/cache.h>
 24 #include <linux/sort.h>
 25 #include <linux/percpu.h>
 26 #include <linux/memblock.h>
 27 #include <linux/mmzone.h>
 28 #include <linux/gfp.h>
 29 
 30 #include <asm/head.h>
 31 #include <asm/page.h>
 32 #include <asm/pgalloc.h>
 33 #include <asm/pgtable.h>
 34 #include <asm/oplib.h>
 35 #include <asm/iommu.h>
 36 #include <asm/io.h>
 37 #include <asm/uaccess.h>
 38 #include <asm/mmu_context.h>
 39 #include <asm/tlbflush.h>
 40 #include <asm/dma.h>
 41 #include <asm/starfire.h>
 42 #include <asm/tlb.h>
 43 #include <asm/spitfire.h>
 44 #include <asm/sections.h>
 45 #include <asm/tsb.h>
 46 #include <asm/hypervisor.h>
 47 #include <asm/prom.h>
 48 #include <asm/mdesc.h>
 49 #include <asm/cpudata.h>
 50 #include <asm/irq.h>
 51 
 52 #include "init_64.h"
 53 
 54 unsigned long kern_linear_pte_xor[4] __read_mostly;
 55 
 56 /* A bitmap, two bits for every 256MB of physical memory.  These two
 57  * bits determine what page size we use for kernel linear
 58  * translations.  They form an index into kern_linear_pte_xor[].  The
 59  * value in the indexed slot is XOR'd with the TLB miss virtual
 60  * address to form the resulting TTE.  The mapping is:
 61  *
 62  *      0       ==>     4MB
 63  *      1       ==>     256MB
 64  *      2       ==>     2GB
 65  *      3       ==>     16GB
 66  *
 67  * All sun4v chips support 256MB pages.  Only SPARC-T4 and later
 68  * support 2GB pages, and hopefully future cpus will support the 16GB
 69  * pages as well.  For slots 2 and 3, we encode a 256MB TTE xor there
 70  * if these larger page sizes are not supported by the cpu.
 71  *
 72  * It would be nice to determine this from the machine description
 73  * 'cpu' properties, but we need to have this table setup before the
 74  * MDESC is initialized.
 75  */
 76 
 77 #ifndef CONFIG_DEBUG_PAGEALLOC
 78 /* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
 79  * Space is allocated for this right after the trap table in
 80  * arch/sparc64/kernel/head.S
 81  */
 82 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 83 #endif
 84 extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
 85 
 86 static unsigned long cpu_pgsz_mask;
 87 
 88 #define MAX_BANKS       1024
 89 
 90 static struct linux_prom64_registers pavail[MAX_BANKS];
 91 static int pavail_ents;
 92 
 93 static int cmp_p64(const void *a, const void *b)
 94 {
 95         const struct linux_prom64_registers *x = a, *y = b;
 96 
 97         if (x->phys_addr > y->phys_addr)
 98                 return 1;
 99         if (x->phys_addr < y->phys_addr)
100                 return -1;
101         return 0;
102 }
103 
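     /* Fetch the named property from the OBP "/memory" node into
      * 'regs', page align each bank (dropping entries that become
      * empty), and sort the result by physical address.
      */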
104 static void __init read_obp_memory(const char *property,
105                                    struct linux_prom64_registers *regs,
106                                    int *num_ents)
107 {
108         phandle node = prom_finddevice("/memory");
109         int prop_size = prom_getproplen(node, property);
110         int ents, ret, i;
111 
112         ents = prop_size / sizeof(struct linux_prom64_registers);
113         if (ents > MAX_BANKS) {
114                 prom_printf("The machine has more %s property entries than "
115                             "this kernel can support (%d).\n",
116                             property, MAX_BANKS);
117                 prom_halt();
118         }
119 
120         ret = prom_getproperty(node, property, (char *) regs, prop_size);
121         if (ret == -1) {
122                 prom_printf("Couldn't get %s property from /memory.\n",
123                                 property);
124                 prom_halt();
125         }
126 
127         /* Sanitize what we got from the firmware, by page aligning
128          * everything.
129          */
130         for (i = 0; i < ents; i++) {
131                 unsigned long base, size;
132 
133                 base = regs[i].phys_addr;
134                 size = regs[i].reg_size;
135 
136                 size &= PAGE_MASK;
137                 if (base & ~PAGE_MASK) {
138                         unsigned long new_base = PAGE_ALIGN(base);
139 
140                         size -= new_base - base;
141                         if ((long) size < 0L)
142                                 size = 0UL;
143                         base = new_base;
144                 }
145                 if (size == 0UL) {
146                         /* If it is empty, simply get rid of it.
147                          * This simplifies the logic of the other
148                          * functions that process these arrays.
149                          */
150                         memmove(&regs[i], &regs[i + 1],
151                                 (ents - i - 1) * sizeof(regs[0]));
152                         i--;
153                         ents--;
154                         continue;
155                 }
156                 regs[i].phys_addr = base;
157                 regs[i].reg_size = size;
158         }
159 
160         *num_ents = ents;
161 
162         sort(regs, ents, sizeof(struct linux_prom64_registers),
163              cmp_p64, NULL);
164 }
165 
166 /* Kernel physical address base and size in bytes.  */
167 unsigned long kern_base __read_mostly;
168 unsigned long kern_size __read_mostly;
169 
170 /* Initial ramdisk setup */
171 extern unsigned long sparc_ramdisk_image64;
172 extern unsigned int sparc_ramdisk_image;
173 extern unsigned int sparc_ramdisk_size;
174 
175 struct page *mem_map_zero __read_mostly;
176 EXPORT_SYMBOL(mem_map_zero);
177 
178 unsigned int sparc64_highest_unlocked_tlb_ent __read_mostly;
179 
180 unsigned long sparc64_kern_pri_context __read_mostly;
181 unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
182 unsigned long sparc64_kern_sec_context __read_mostly;
183 
184 int num_kernel_image_mappings;
185 
186 #ifdef CONFIG_DEBUG_DCFLUSH
187 atomic_t dcpage_flushes = ATOMIC_INIT(0);
188 #ifdef CONFIG_SMP
189 atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0);
190 #endif
191 #endif
192 
193 inline void flush_dcache_page_impl(struct page *page)
194 {
195         BUG_ON(tlb_type == hypervisor);
196 #ifdef CONFIG_DEBUG_DCFLUSH
197         atomic_inc(&dcpage_flushes);
198 #endif
199 
200 #ifdef DCACHE_ALIASING_POSSIBLE
201         __flush_dcache_page(page_address(page),
202                             ((tlb_type == spitfire) &&
203                              page_mapping(page) != NULL));
204 #else
205         if (page_mapping(page) != NULL &&
206             tlb_type == spitfire)
207                 __flush_icache_page(__pa(page_address(page)));
208 #endif
209 }
210 
211 #define PG_dcache_dirty         PG_arch_1
212 #define PG_dcache_cpu_shift     32UL
213 #define PG_dcache_cpu_mask      \
214         ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
215 
216 #define dcache_dirty_cpu(page) \
217         (((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
218 
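     /* Atomically record 'this_cpu' in the dcache-cpu field of
      * page->flags and set PG_dcache_dirty, using a casx
      * compare-and-swap loop so concurrent flag updates are not lost.
      */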
219 static inline void set_dcache_dirty(struct page *page, int this_cpu)
220 {
221         unsigned long mask = this_cpu;
222         unsigned long non_cpu_bits;
223 
224         non_cpu_bits = ~(PG_dcache_cpu_mask << PG_dcache_cpu_shift);
225         mask = (mask << PG_dcache_cpu_shift) | (1UL << PG_dcache_dirty);
226 
227         __asm__ __volatile__("1:\n\t"
228                              "ldx       [%2], %%g7\n\t"
229                              "and       %%g7, %1, %%g1\n\t"
230                              "or        %%g1, %0, %%g1\n\t"
231                              "casx      [%2], %%g7, %%g1\n\t"
232                              "cmp       %%g7, %%g1\n\t"
233                              "bne,pn    %%xcc, 1b\n\t"
234                              " nop"
235                              : /* no outputs */
236                              : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
237                              : "g1", "g7");
238 }
239 
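     /* Clear PG_dcache_dirty in page->flags, but only if the cpu
      * recorded in the dcache-cpu field still matches 'cpu'; the casx
      * loop keeps the read-modify-write atomic.
      */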
240 static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
241 {
242         unsigned long mask = (1UL << PG_dcache_dirty);
243 
244         __asm__ __volatile__("! test_and_clear_dcache_dirty\n"
245                              "1:\n\t"
246                              "ldx       [%2], %%g7\n\t"
247                              "srlx      %%g7, %4, %%g1\n\t"
248                              "and       %%g1, %3, %%g1\n\t"
249                              "cmp       %%g1, %0\n\t"
250                              "bne,pn    %%icc, 2f\n\t"
251                              " andn     %%g7, %1, %%g1\n\t"
252                              "casx      [%2], %%g7, %%g1\n\t"
253                              "cmp       %%g7, %%g1\n\t"
254                              "bne,pn    %%xcc, 1b\n\t"
255                              " nop\n"
256                              "2:"
257                              : /* no outputs */
258                              : "r" (cpu), "r" (mask), "r" (&page->flags),
259                                "i" (PG_dcache_cpu_mask),
260                                "i" (PG_dcache_cpu_shift)
261                              : "g1", "g7");
262 }
263 
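     /* Insert a (tag, pte) pair into a TSB entry.  cheetah_plus and
      * sun4v access the TSB by physical address, so translate the
      * entry pointer with __pa() first.
      */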
264 static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte)
265 {
266         unsigned long tsb_addr = (unsigned long) ent;
267 
268         if (tlb_type == cheetah_plus || tlb_type == hypervisor)
269                 tsb_addr = __pa(tsb_addr);
270 
271         __tsb_insert(tsb_addr, tag, pte);
272 }
273 
274 unsigned long _PAGE_ALL_SZ_BITS __read_mostly;
275 
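     /* If the page backing 'pfn' is marked D-cache dirty, flush it on
      * the cpu that dirtied it (locally or via a cross call) and then
      * clear the dirty marker.
      */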
276 static void flush_dcache(unsigned long pfn)
277 {
278         struct page *page;
279 
280         page = pfn_to_page(pfn);
281         if (page) {
282                 unsigned long pg_flags;
283 
284                 pg_flags = page->flags;
285                 if (pg_flags & (1UL << PG_dcache_dirty)) {
286                         int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
287                                    PG_dcache_cpu_mask);
288                         int this_cpu = get_cpu();
289 
290                         /* This is just to optimize away some function calls
291                          * in the SMP case.
292                          */
293                         if (cpu == this_cpu)
294                                 flush_dcache_page_impl(page);
295                         else
296                                 smp_flush_dcache_page_impl(page, cpu);
297 
298                         clear_dcache_dirty_cpu(page, cpu);
299 
300                         put_cpu();
301                 }
302         }
303 }
304 
305 /* mm->context.lock must be held */
306 static void __update_mmu_tsb_insert(struct mm_struct *mm, unsigned long tsb_index,
307                                     unsigned long tsb_hash_shift, unsigned long address,
308                                     unsigned long tte)
309 {
310         struct tsb *tsb = mm->context.tsb_block[tsb_index].tsb;
311         unsigned long tag;
312 
313         if (unlikely(!tsb))
314                 return;
315 
316         tsb += ((address >> tsb_hash_shift) &
317                 (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL));
318         tag = (address >> 22UL);
319         tsb_insert(tsb, tag, tte);
320 }
321 
322 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
323 static inline bool is_hugetlb_pte(pte_t pte)
324 {
325         if ((tlb_type == hypervisor &&
326              (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
327             (tlb_type != hypervisor &&
328              (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U))
329                 return true;
330         return false;
331 }
332 #endif
333 
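     /* Called by the generic MM code after a PTE has been installed.
      * Resolve any pending lazy D-cache flush (sun4u only) and
      * pre-load the new translation into the mm's TSB so the next
      * access does not take a TSB miss.
      */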
334 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
335 {
336         struct mm_struct *mm;
337         unsigned long flags;
338         pte_t pte = *ptep;
339 
340         if (tlb_type != hypervisor) {
341                 unsigned long pfn = pte_pfn(pte);
342 
343                 if (pfn_valid(pfn))
344                         flush_dcache(pfn);
345         }
346 
347         mm = vma->vm_mm;
348 
349         /* Don't insert a non-valid PTE into the TSB, we'll deadlock.  */
350         if (!pte_accessible(mm, pte))
351                 return;
352 
353         spin_lock_irqsave(&mm->context.lock, flags);
354 
355 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
356         if ((mm->context.hugetlb_pte_count || mm->context.thp_pte_count) &&
357             is_hugetlb_pte(pte))
358                 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
359                                         address, pte_val(pte));
360         else
361 #endif
362                 __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT,
363                                         address, pte_val(pte));
364 
365         spin_unlock_irqrestore(&mm->context.lock, flags);
366 }
367 
368 void flush_dcache_page(struct page *page)
369 {
370         struct address_space *mapping;
371         int this_cpu;
372 
373         if (tlb_type == hypervisor)
374                 return;
375 
376         /* Do not bother with the expensive D-cache flush if it
377          * is merely the zero page.  The 'bigcore' testcase in GDB
378          * causes this case to run millions of times.
379          */
380         if (page == ZERO_PAGE(0))
381                 return;
382 
383         this_cpu = get_cpu();
384 
385         mapping = page_mapping(page);
386         if (mapping && !mapping_mapped(mapping)) {
387                 int dirty = test_bit(PG_dcache_dirty, &page->flags);
388                 if (dirty) {
389                         int dirty_cpu = dcache_dirty_cpu(page);
390 
391                         if (dirty_cpu == this_cpu)
392                                 goto out;
393                         smp_flush_dcache_page_impl(page, dirty_cpu);
394                 }
395                 set_dcache_dirty(page, this_cpu);
396         } else {
397                 /* We could delay the flush for the !page_mapping
398                  * case too.  But that case is for exec env/arg
 399                  * pages and those are 99% certain to get
 400                  * faulted into the tlb (and thus flushed) anyway.
401                  */
402                 flush_dcache_page_impl(page);
403         }
404 
405 out:
406         put_cpu();
407 }
408 EXPORT_SYMBOL(flush_dcache_page);
409 
410 void __kprobes flush_icache_range(unsigned long start, unsigned long end)
411 {
412         /* Cheetah and Hypervisor platform cpus have coherent I-cache. */
413         if (tlb_type == spitfire) {
414                 unsigned long kaddr;
415 
 416                 /* This code only runs on Spitfire cpus, which is
 417                  * why we can assume _PAGE_PADDR_4U.
 418                  */
419                 for (kaddr = start; kaddr < end; kaddr += PAGE_SIZE) {
420                         unsigned long paddr, mask = _PAGE_PADDR_4U;
421 
422                         if (kaddr >= PAGE_OFFSET)
423                                 paddr = kaddr & mask;
424                         else {
425                                 pgd_t *pgdp = pgd_offset_k(kaddr);
426                                 pud_t *pudp = pud_offset(pgdp, kaddr);
427                                 pmd_t *pmdp = pmd_offset(pudp, kaddr);
428                                 pte_t *ptep = pte_offset_kernel(pmdp, kaddr);
429 
430                                 paddr = pte_val(*ptep) & mask;
431                         }
432                         __flush_icache_page(paddr);
433                 }
434         }
435 }
436 EXPORT_SYMBOL(flush_icache_range);
437 
438 void mmu_info(struct seq_file *m)
439 {
440         static const char *pgsz_strings[] = {
441                 "8K", "64K", "512K", "4MB", "32MB",
442                 "256MB", "2GB", "16GB",
443         };
444         int i, printed;
445 
446         if (tlb_type == cheetah)
447                 seq_printf(m, "MMU Type\t: Cheetah\n");
448         else if (tlb_type == cheetah_plus)
449                 seq_printf(m, "MMU Type\t: Cheetah+\n");
450         else if (tlb_type == spitfire)
451                 seq_printf(m, "MMU Type\t: Spitfire\n");
452         else if (tlb_type == hypervisor)
453                 seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n");
454         else
455                 seq_printf(m, "MMU Type\t: ???\n");
456 
457         seq_printf(m, "MMU PGSZs\t: ");
458         printed = 0;
459         for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) {
460                 if (cpu_pgsz_mask & (1UL << i)) {
461                         seq_printf(m, "%s%s",
462                                    printed ? "," : "", pgsz_strings[i]);
463                         printed++;
464                 }
465         }
466         seq_putc(m, '\n');
467 
468 #ifdef CONFIG_DEBUG_DCFLUSH
469         seq_printf(m, "DCPageFlushes\t: %d\n",
470                    atomic_read(&dcpage_flushes));
471 #ifdef CONFIG_SMP
472         seq_printf(m, "DCPageFlushesXC\t: %d\n",
473                    atomic_read(&dcpage_flushes_xcall));
474 #endif /* CONFIG_SMP */
475 #endif /* CONFIG_DEBUG_DCFLUSH */
476 }
477 
478 struct linux_prom_translation prom_trans[512] __read_mostly;
479 unsigned int prom_trans_ents __read_mostly;
480 
481 unsigned long kern_locked_tte_data;
482 
483 /* The obp translations are saved based on 8k pagesize, since obp can
484  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
485  * HI_OBP_ADDRESS range are handled in ktlb.S.
486  */
487 static inline int in_obp_range(unsigned long vaddr)
488 {
489         return (vaddr >= LOW_OBP_ADDRESS &&
490                 vaddr < HI_OBP_ADDRESS);
491 }
492 
493 static int cmp_ptrans(const void *a, const void *b)
494 {
495         const struct linux_prom_translation *x = a, *y = b;
496 
497         if (x->virt > y->virt)
498                 return 1;
499         if (x->virt < y->virt)
500                 return -1;
501         return 0;
502 }
503 
504 /* Read OBP translations property into 'prom_trans[]'.  */
505 static void __init read_obp_translations(void)
506 {
507         int n, node, ents, first, last, i;
508 
509         node = prom_finddevice("/virtual-memory");
510         n = prom_getproplen(node, "translations");
511         if (unlikely(n == 0 || n == -1)) {
512                 prom_printf("prom_mappings: Couldn't get size.\n");
513                 prom_halt();
514         }
515         if (unlikely(n > sizeof(prom_trans))) {
516                 prom_printf("prom_mappings: Size %d is too big.\n", n);
517                 prom_halt();
518         }
519 
520         if ((n = prom_getproperty(node, "translations",
521                                   (char *)&prom_trans[0],
522                                   sizeof(prom_trans))) == -1) {
523                 prom_printf("prom_mappings: Couldn't get property.\n");
524                 prom_halt();
525         }
526 
527         n = n / sizeof(struct linux_prom_translation);
528 
529         ents = n;
530 
531         sort(prom_trans, ents, sizeof(struct linux_prom_translation),
532              cmp_ptrans, NULL);
533 
534         /* Now kick out all the non-OBP entries.  */
535         for (i = 0; i < ents; i++) {
536                 if (in_obp_range(prom_trans[i].virt))
537                         break;
538         }
539         first = i;
540         for (; i < ents; i++) {
541                 if (!in_obp_range(prom_trans[i].virt))
542                         break;
543         }
544         last = i;
545 
546         for (i = 0; i < (last - first); i++) {
547                 struct linux_prom_translation *src = &prom_trans[i + first];
548                 struct linux_prom_translation *dest = &prom_trans[i];
549 
550                 *dest = *src;
551         }
552         for (; i < ents; i++) {
553                 struct linux_prom_translation *dest = &prom_trans[i];
554                 dest->virt = dest->size = dest->data = 0x0UL;
555         }
556 
557         prom_trans_ents = last - first;
558 
559         if (tlb_type == spitfire) {
560                 /* Clear diag TTE bits. */
561                 for (i = 0; i < prom_trans_ents; i++)
562                         prom_trans[i].data &= ~0x0003fe0000000000UL;
563         }
564 
565         /* Force execute bit on.  */
566         for (i = 0; i < prom_trans_ents; i++)
567                 prom_trans[i].data |= (tlb_type == hypervisor ?
568                                        _PAGE_EXEC_4V : _PAGE_EXEC_4U);
569 }
570 
571 static void __init hypervisor_tlb_lock(unsigned long vaddr,
572                                        unsigned long pte,
573                                        unsigned long mmu)
574 {
575         unsigned long ret = sun4v_mmu_map_perm_addr(vaddr, 0, pte, mmu);
576 
577         if (ret != 0) {
578                 prom_printf("hypervisor_tlb_lock[%lx:%x:%lx:%lx]: "
579                             "errors with %lx\n", vaddr, 0, pte, mmu, ret);
580                 prom_halt();
581         }
582 }
583 
584 static unsigned long kern_large_tte(unsigned long paddr);
585 
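     /* Lock the kernel image into the TLB with one 4MB ITLB and DTLB
      * entry per image mapping, installed via the hypervisor on sun4v
      * or via OBP on sun4u.  Also record the locked TTE data and, on
      * cheetah_plus, the kernel primary/secondary context values.
      */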
586 static void __init remap_kernel(void)
587 {
588         unsigned long phys_page, tte_vaddr, tte_data;
589         int i, tlb_ent = sparc64_highest_locked_tlbent();
590 
591         tte_vaddr = (unsigned long) KERNBASE;
592         phys_page = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
593         tte_data = kern_large_tte(phys_page);
594 
595         kern_locked_tte_data = tte_data;
596 
597         /* Now lock us into the TLBs via Hypervisor or OBP. */
598         if (tlb_type == hypervisor) {
599                 for (i = 0; i < num_kernel_image_mappings; i++) {
600                         hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU);
601                         hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU);
602                         tte_vaddr += 0x400000;
603                         tte_data += 0x400000;
604                 }
605         } else {
606                 for (i = 0; i < num_kernel_image_mappings; i++) {
607                         prom_dtlb_load(tlb_ent - i, tte_data, tte_vaddr);
608                         prom_itlb_load(tlb_ent - i, tte_data, tte_vaddr);
609                         tte_vaddr += 0x400000;
610                         tte_data += 0x400000;
611                 }
612                 sparc64_highest_unlocked_tlb_ent = tlb_ent - i;
613         }
614         if (tlb_type == cheetah_plus) {
615                 sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 |
616                                             CTX_CHEETAH_PLUS_NUC);
617                 sparc64_kern_pri_nuc_bits = CTX_CHEETAH_PLUS_NUC;
618                 sparc64_kern_sec_context = CTX_CHEETAH_PLUS_CTX0;
619         }
620 }
621 
622 
623 static void __init inherit_prom_mappings(void)
624 {
625         /* Now fixup OBP's idea about where we really are mapped. */
626         printk("Remapping the kernel... ");
627         remap_kernel();
628         printk("done.\n");
629 }
630 
631 void prom_world(int enter)
632 {
633         if (!enter)
634                 set_fs(get_fs());
635 
636         __asm__ __volatile__("flushw");
637 }
638 
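     /* Flush the D-cache lines covering [start, end).  Spitfire is
      * handled by clearing D-cache tags directly (the loop caps at the
      * 512-line cache), while cheetah chips take physical addresses
      * through the ASI_DCACHE_INVALIDATE ASI.
      */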
639 void __flush_dcache_range(unsigned long start, unsigned long end)
640 {
641         unsigned long va;
642 
643         if (tlb_type == spitfire) {
644                 int n = 0;
645 
646                 for (va = start; va < end; va += 32) {
647                         spitfire_put_dcache_tag(va & 0x3fe0, 0x0);
648                         if (++n >= 512)
649                                 break;
650                 }
651         } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
652                 start = __pa(start);
653                 end = __pa(end);
654                 for (va = start; va < end; va += 32)
655                         __asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
656                                              "membar #Sync"
657                                              : /* no outputs */
658                                              : "r" (va),
659                                                "i" (ASI_DCACHE_INVALIDATE));
660         }
661 }
662 EXPORT_SYMBOL(__flush_dcache_range);
663 
664 /* get_new_mmu_context() uses "cache + 1".  */
665 DEFINE_SPINLOCK(ctx_alloc_lock);
666 unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
667 #define MAX_CTX_NR      (1UL << CTX_NR_BITS)
668 #define CTX_BMAP_SLOTS  BITS_TO_LONGS(MAX_CTX_NR)
669 DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR);
670 
671 /* Caller does TLB context flushing on local CPU if necessary.
672  * The caller also ensures that CTX_VALID(mm->context) is false.
673  *
 674  * We must be careful about boundary cases so that we never
 675  * let the user have CTX 0 (nucleus) and never use a CTX
 676  * version of zero (otherwise NO_CONTEXT would not be caught
 677  * by the version mismatch tests in mmu_context.h).
678  *
679  * Always invoked with interrupts disabled.
680  */
681 void get_new_mmu_context(struct mm_struct *mm)
682 {
683         unsigned long ctx, new_ctx;
684         unsigned long orig_pgsz_bits;
685         int new_version;
686 
687         spin_lock(&ctx_alloc_lock);
688         orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
689         ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
690         new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
691         new_version = 0;
692         if (new_ctx >= (1 << CTX_NR_BITS)) {
693                 new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
694                 if (new_ctx >= ctx) {
695                         int i;
696                         new_ctx = (tlb_context_cache & CTX_VERSION_MASK) +
697                                 CTX_FIRST_VERSION;
698                         if (new_ctx == 1)
699                                 new_ctx = CTX_FIRST_VERSION;
700 
701                         /* Don't call memset, for 16 entries that's just
702                          * plain silly...
703                          */
704                         mmu_context_bmap[0] = 3;
705                         mmu_context_bmap[1] = 0;
706                         mmu_context_bmap[2] = 0;
707                         mmu_context_bmap[3] = 0;
708                         for (i = 4; i < CTX_BMAP_SLOTS; i += 4) {
709                                 mmu_context_bmap[i + 0] = 0;
710                                 mmu_context_bmap[i + 1] = 0;
711                                 mmu_context_bmap[i + 2] = 0;
712                                 mmu_context_bmap[i + 3] = 0;
713                         }
714                         new_version = 1;
715                         goto out;
716                 }
717         }
718         mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63));
719         new_ctx |= (tlb_context_cache & CTX_VERSION_MASK);
720 out:
721         tlb_context_cache = new_ctx;
722         mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
723         spin_unlock(&ctx_alloc_lock);
724 
725         if (unlikely(new_version))
726                 smp_new_mmu_context_version();
727 }
728 
729 static int numa_enabled = 1;
730 static int numa_debug;
731 
732 static int __init early_numa(char *p)
733 {
734         if (!p)
735                 return 0;
736 
737         if (strstr(p, "off"))
738                 numa_enabled = 0;
739 
740         if (strstr(p, "debug"))
741                 numa_debug = 1;
742 
743         return 0;
744 }
745 early_param("numa", early_numa);
746 
747 #define numadbg(f, a...) \
748 do {    if (numa_debug) \
749                 printk(KERN_INFO f, ## a); \
750 } while (0)
751 
752 static void __init find_ramdisk(unsigned long phys_base)
753 {
754 #ifdef CONFIG_BLK_DEV_INITRD
755         if (sparc_ramdisk_image || sparc_ramdisk_image64) {
756                 unsigned long ramdisk_image;
757 
758                 /* Older versions of the bootloader only supported a
759                  * 32-bit physical address for the ramdisk image
760                  * location, stored at sparc_ramdisk_image.  Newer
761                  * SILO versions set sparc_ramdisk_image to zero and
762                  * provide a full 64-bit physical address at
763                  * sparc_ramdisk_image64.
764                  */
765                 ramdisk_image = sparc_ramdisk_image;
766                 if (!ramdisk_image)
767                         ramdisk_image = sparc_ramdisk_image64;
768 
769                 /* Another bootloader quirk.  The bootloader normalizes
770                  * the physical address to KERNBASE, so we have to
771                  * factor that back out and add in the lowest valid
772                  * physical page address to get the true physical address.
773                  */
774                 ramdisk_image -= KERNBASE;
775                 ramdisk_image += phys_base;
776 
777                 numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
778                         ramdisk_image, sparc_ramdisk_size);
779 
780                 initrd_start = ramdisk_image;
781                 initrd_end = ramdisk_image + sparc_ramdisk_size;
782 
783                 memblock_reserve(initrd_start, sparc_ramdisk_size);
784 
785                 initrd_start += PAGE_OFFSET;
786                 initrd_end += PAGE_OFFSET;
787         }
788 #endif
789 }
790 
791 struct node_mem_mask {
792         unsigned long mask;
793         unsigned long val;
794 };
795 static struct node_mem_mask node_masks[MAX_NUMNODES];
796 static int num_node_masks;
797 
798 int numa_cpu_lookup_table[NR_CPUS];
799 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
800 
801 #ifdef CONFIG_NEED_MULTIPLE_NODES
802 
803 struct mdesc_mblock {
804         u64     base;
805         u64     size;
806         u64     offset; /* RA-to-PA */
807 };
808 static struct mdesc_mblock *mblocks;
809 static int num_mblocks;
810 static int find_numa_node_for_addr(unsigned long pa,
811                                    struct node_mem_mask *pnode_mask);
812 
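     /* Translate a machine-description real address into a physical
      * address by applying the RA-to-PA offset of the mblock that
      * contains it; addresses outside every mblock pass through
      * unchanged.
      */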
813 static unsigned long __init ra_to_pa(unsigned long addr)
814 {
815         int i;
816 
817         for (i = 0; i < num_mblocks; i++) {
818                 struct mdesc_mblock *m = &mblocks[i];
819 
820                 if (addr >= m->base &&
821                     addr < (m->base + m->size)) {
822                         addr += m->offset;
823                         break;
824                 }
825         }
826         return addr;
827 }
828 
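     /* Map a physical address to its NUMA node: first try the
      * node_masks[] mask/value table, then fall back to searching the
      * machine description, defaulting to node 0 (with a one-time
      * warning) when no latency group covers the address.
      */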
829 static int __init find_node(unsigned long addr)
830 {
831         static bool search_mdesc = true;
832         static struct node_mem_mask last_mem_mask = { ~0UL, ~0UL };
833         static int last_index;
834         int i;
835 
836         addr = ra_to_pa(addr);
837         for (i = 0; i < num_node_masks; i++) {
838                 struct node_mem_mask *p = &node_masks[i];
839 
840                 if ((addr & p->mask) == p->val)
841                         return i;
842         }
 843         /* The following condition has been observed on LDOM guests because
 844          * node_masks only contains the best latency mask and value.
 845          * An LDOM guest's mdesc can contain a single latency group that
 846          * covers multiple address ranges.  Print a warning message only
 847          * if the address cannot be found in node_masks or the mdesc.
 848          */
849         if ((search_mdesc) &&
850             ((addr & last_mem_mask.mask) != last_mem_mask.val)) {
851                 /* find the available node in the mdesc */
852                 last_index = find_numa_node_for_addr(addr, &last_mem_mask);
853                 numadbg("find_node: latency group for address 0x%lx is %d\n",
854                         addr, last_index);
855                 if ((last_index < 0) || (last_index >= num_node_masks)) {
856                         /* WARN_ONCE() and use default group 0 */
857                         WARN_ONCE(1, "find_node: A physical address doesn't match a NUMA node rule. Some physical memory will be owned by node 0.");
858                         search_mdesc = false;
859                         last_index = 0;
860                 }
861         }
862 
863         return last_index;
864 }
865 
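     /* Walk [start, end) a page at a time and return the end of the
      * leading run that stays on a single NUMA node; that node's id is
      * stored in *nid.
      */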
866 static u64 __init memblock_nid_range(u64 start, u64 end, int *nid)
867 {
868         *nid = find_node(start);
869         start += PAGE_SIZE;
870         while (start < end) {
871                 int n = find_node(start);
872 
873                 if (n != *nid)
874                         break;
875                 start += PAGE_SIZE;
876         }
877 
878         if (start > end)
879                 start = end;
880 
881         return start;
882 }
883 #endif
884 
885 /* This must be invoked after performing all of the necessary
886  * memblock_set_node() calls for 'nid'.  We need to be able to get
887  * correct data from get_pfn_range_for_nid().
888  */
889 static void __init allocate_node_data(int nid)
890 {
891         struct pglist_data *p;
892         unsigned long start_pfn, end_pfn;
893 #ifdef CONFIG_NEED_MULTIPLE_NODES
894         unsigned long paddr;
895 
896         paddr = memblock_alloc_try_nid(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid);
897         if (!paddr) {
898                 prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
899                 prom_halt();
900         }
901         NODE_DATA(nid) = __va(paddr);
902         memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
903 
904         NODE_DATA(nid)->node_id = nid;
905 #endif
906 
907         p = NODE_DATA(nid);
908 
909         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
910         p->node_start_pfn = start_pfn;
911         p->node_spanned_pages = end_pfn - start_pfn;
912 }
913 
914 static void init_node_masks_nonnuma(void)
915 {
916         int i;
917 
918         numadbg("Initializing tables for non-numa.\n");
919 
920         node_masks[0].mask = node_masks[0].val = 0;
921         num_node_masks = 1;
922 
923         for (i = 0; i < NR_CPUS; i++)
924                 numa_cpu_lookup_table[i] = 0;
925 
926         cpumask_setall(&numa_cpumask_lookup_table[0]);
927 }
928 
929 #ifdef CONFIG_NEED_MULTIPLE_NODES
930 struct pglist_data *node_data[MAX_NUMNODES];
931 
932 EXPORT_SYMBOL(numa_cpu_lookup_table);
933 EXPORT_SYMBOL(numa_cpumask_lookup_table);
934 EXPORT_SYMBOL(node_data);
935 
936 struct mdesc_mlgroup {
937         u64     node;
938         u64     latency;
939         u64     match;
940         u64     mask;
941 };
942 static struct mdesc_mlgroup *mlgroups;
943 static int num_mlgroups;
944 
945 static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
946                                    u32 cfg_handle)
947 {
948         u64 arc;
949 
950         mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) {
951                 u64 target = mdesc_arc_target(md, arc);
952                 const u64 *val;
953 
954                 val = mdesc_get_property(md, target,
955                                          "cfg-handle", NULL);
956                 if (val && *val == cfg_handle)
957                         return 0;
958         }
959         return -ENODEV;
960 }
961 
962 static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
963                                     u32 cfg_handle)
964 {
965         u64 arc, candidate, best_latency = ~(u64)0;
966 
967         candidate = MDESC_NODE_NULL;
968         mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
969                 u64 target = mdesc_arc_target(md, arc);
970                 const char *name = mdesc_node_name(md, target);
971                 const u64 *val;
972 
973                 if (strcmp(name, "pio-latency-group"))
974                         continue;
975 
976                 val = mdesc_get_property(md, target, "latency", NULL);
977                 if (!val)
978                         continue;
979 
980                 if (*val < best_latency) {
981                         candidate = target;
982                         best_latency = *val;
983                 }
984         }
985 
986         if (candidate == MDESC_NODE_NULL)
987                 return -ENODEV;
988 
989         return scan_pio_for_cfg_handle(md, candidate, cfg_handle);
990 }
991 
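     /* Map a device tree node (typically a PCI controller) to a NUMA
      * node by taking the cfg-handle from its "reg" property and
      * matching it against the machine description "group" nodes.
      */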
992 int of_node_to_nid(struct device_node *dp)
993 {
994         const struct linux_prom64_registers *regs;
995         struct mdesc_handle *md;
996         u32 cfg_handle;
997         int count, nid;
998         u64 grp;
999 
1000         /* This is the right thing to do on currently supported
1001          * SUN4U NUMA platforms as well, as the PCI controller does
1002          * not sit behind any particular memory controller.
1003          */
1004         if (!mlgroups)
1005                 return -1;
1006 
1007         regs = of_get_property(dp, "reg", NULL);
1008         if (!regs)
1009                 return -1;
1010 
1011         cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;
1012 
1013         md = mdesc_grab();
1014 
1015         count = 0;
1016         nid = -1;
1017         mdesc_for_each_node_by_name(md, grp, "group") {
1018                 if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
1019                         nid = count;
1020                         break;
1021                 }
1022                 count++;
1023         }
1024 
1025         mdesc_release(md);
1026 
1027         return nid;
1028 }
1029 
1030 static void __init add_node_ranges(void)
1031 {
1032         struct memblock_region *reg;
1033 
1034         for_each_memblock(memory, reg) {
1035                 unsigned long size = reg->size;
1036                 unsigned long start, end;
1037 
1038                 start = reg->base;
1039                 end = start + size;
1040                 while (start < end) {
1041                         unsigned long this_end;
1042                         int nid;
1043 
1044                         this_end = memblock_nid_range(start, end, &nid);
1045 
1046                         numadbg("Setting memblock NUMA node nid[%d] "
1047                                 "start[%lx] end[%lx]\n",
1048                                 nid, start, this_end);
1049 
1050                         memblock_set_node(start, this_end - start, nid);
1051                         start = this_end;
1052                 }
1053         }
1054 }
1055 
1056 static int __init grab_mlgroups(struct mdesc_handle *md)
1057 {
1058         unsigned long paddr;
1059         int count = 0;
1060         u64 node;
1061 
1062         mdesc_for_each_node_by_name(md, node, "memory-latency-group")
1063                 count++;
1064         if (!count)
1065                 return -ENOENT;
1066 
1067         paddr = memblock_alloc(count * sizeof(struct mdesc_mlgroup),
1068                           SMP_CACHE_BYTES);
1069         if (!paddr)
1070                 return -ENOMEM;
1071 
1072         mlgroups = __va(paddr);
1073         num_mlgroups = count;
1074 
1075         count = 0;
1076         mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
1077                 struct mdesc_mlgroup *m = &mlgroups[count++];
1078                 const u64 *val;
1079 
1080                 m->node = node;
1081 
1082                 val = mdesc_get_property(md, node, "latency", NULL);
1083                 m->latency = *val;
1084                 val = mdesc_get_property(md, node, "address-match", NULL);
1085                 m->match = *val;
1086                 val = mdesc_get_property(md, node, "address-mask", NULL);
1087                 m->mask = *val;
1088 
1089                 numadbg("MLGROUP[%d]: node[%llx] latency[%llx] "
1090                         "match[%llx] mask[%llx]\n",
1091                         count - 1, m->node, m->latency, m->match, m->mask);
1092         }
1093 
1094         return 0;
1095 }
1096 
1097 static int __init grab_mblocks(struct mdesc_handle *md)
1098 {
1099         unsigned long paddr;
1100         int count = 0;
1101         u64 node;
1102 
1103         mdesc_for_each_node_by_name(md, node, "mblock")
1104                 count++;
1105         if (!count)
1106                 return -ENOENT;
1107 
1108         paddr = memblock_alloc(count * sizeof(struct mdesc_mblock),
1109                           SMP_CACHE_BYTES);
1110         if (!paddr)
1111                 return -ENOMEM;
1112 
1113         mblocks = __va(paddr);
1114         num_mblocks = count;
1115 
1116         count = 0;
1117         mdesc_for_each_node_by_name(md, node, "mblock") {
1118                 struct mdesc_mblock *m = &mblocks[count++];
1119                 const u64 *val;
1120 
1121                 val = mdesc_get_property(md, node, "base", NULL);
1122                 m->base = *val;
1123                 val = mdesc_get_property(md, node, "size", NULL);
1124                 m->size = *val;
1125                 val = mdesc_get_property(md, node,
1126                                          "address-congruence-offset", NULL);
1127 
1128                 /* The address-congruence-offset property is optional.
1129                  * Explicitly zero the offset to identify this case.
1130                  */
1131                 if (val)
1132                         m->offset = *val;
1133                 else
1134                         m->offset = 0UL;
1135 
1136                 numadbg("MBLOCK[%d]: base[%llx] size[%llx] offset[%llx]\n",
1137                         count - 1, m->base, m->size, m->offset);
1138         }
1139 
1140         return 0;
1141 }
1142 
1143 static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
1144                                                u64 grp, cpumask_t *mask)
1145 {
1146         u64 arc;
1147 
1148         cpumask_clear(mask);
1149 
1150         mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
1151                 u64 target = mdesc_arc_target(md, arc);
1152                 const char *name = mdesc_node_name(md, target);
1153                 const u64 *id;
1154 
1155                 if (strcmp(name, "cpu"))
1156                         continue;
1157                 id = mdesc_get_property(md, target, "id", NULL);
1158                 if (*id < nr_cpu_ids)
1159                         cpumask_set_cpu(*id, mask);
1160         }
1161 }
1162 
1163 static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
1164 {
1165         int i;
1166 
1167         for (i = 0; i < num_mlgroups; i++) {
1168                 struct mdesc_mlgroup *m = &mlgroups[i];
1169                 if (m->node == node)
1170                         return m;
1171         }
1172         return NULL;
1173 }
1174 
1175 static int find_numa_node_for_addr(unsigned long pa,
1176                                    struct node_mem_mask *pnode_mask)
1177 {
1178         struct mdesc_handle *md = mdesc_grab();
1179         u64 node, arc;
1180         int i = 0;
1181 
1182         node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
1183         if (node == MDESC_NODE_NULL)
1184                 goto out;
1185 
1186         mdesc_for_each_node_by_name(md, node, "group") {
1187                 mdesc_for_each_arc(arc, md, node, MDESC_ARC_TYPE_FWD) {
1188                         u64 target = mdesc_arc_target(md, arc);
1189                         struct mdesc_mlgroup *m = find_mlgroup(target);
1190 
1191                         if (!m)
1192                                 continue;
1193                         if ((pa & m->mask) == m->match) {
1194                                 if (pnode_mask) {
1195                                         pnode_mask->mask = m->mask;
1196                                         pnode_mask->val = m->match;
1197                                 }
1198                                 mdesc_release(md);
1199                                 return i;
1200                         }
1201                 }
1202                 i++;
1203         }
1204 
1205 out:
1206         mdesc_release(md);
1207         return -1;
1208 }
1209 
1210 static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
1211                                       int index)
1212 {
1213         struct mdesc_mlgroup *candidate = NULL;
1214         u64 arc, best_latency = ~(u64)0;
1215         struct node_mem_mask *n;
1216 
1217         mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
1218                 u64 target = mdesc_arc_target(md, arc);
1219                 struct mdesc_mlgroup *m = find_mlgroup(target);
1220                 if (!m)
1221                         continue;
1222                 if (m->latency < best_latency) {
1223                         candidate = m;
1224                         best_latency = m->latency;
1225                 }
1226         }
1227         if (!candidate)
1228                 return -ENOENT;
1229 
1230         if (num_node_masks != index) {
1231                 printk(KERN_ERR "Inconsistent NUMA state, "
1232                        "index[%d] != num_node_masks[%d]\n",
1233                        index, num_node_masks);
1234                 return -EINVAL;
1235         }
1236 
1237         n = &node_masks[num_node_masks++];
1238 
1239         n->mask = candidate->mask;
1240         n->val = candidate->match;
1241 
1242         numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%llx])\n",
1243                 index, n->mask, n->val, candidate->latency);
1244 
1245         return 0;
1246 }
1247 
1248 static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
1249                                          int index)
1250 {
1251         cpumask_t mask;
1252         int cpu;
1253 
1254         numa_parse_mdesc_group_cpus(md, grp, &mask);
1255 
1256         for_each_cpu(cpu, &mask)
1257                 numa_cpu_lookup_table[cpu] = index;
1258         cpumask_copy(&numa_cpumask_lookup_table[index], &mask);
1259 
1260         if (numa_debug) {
1261                 printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
1262                 for_each_cpu(cpu, &mask)
1263                         printk("%d ", cpu);
1264                 printk("]\n");
1265         }
1266 
1267         return numa_attach_mlgroup(md, grp, index);
1268 }
1269 
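     /* Build NUMA state from the sun4v machine description: collect
      * mblocks and memory latency groups, bind cpus and memory masks
      * to each "group" node, then tag memblock ranges with node ids
      * and bring the nodes online.
      */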
1270 static int __init numa_parse_mdesc(void)
1271 {
1272         struct mdesc_handle *md = mdesc_grab();
1273         int i, err, count;
1274         u64 node;
1275 
1276         node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
1277         if (node == MDESC_NODE_NULL) {
1278                 mdesc_release(md);
1279                 return -ENOENT;
1280         }
1281 
1282         err = grab_mblocks(md);
1283         if (err < 0)
1284                 goto out;
1285 
1286         err = grab_mlgroups(md);
1287         if (err < 0)
1288                 goto out;
1289 
1290         count = 0;
1291         mdesc_for_each_node_by_name(md, node, "group") {
1292                 err = numa_parse_mdesc_group(md, node, count);
1293                 if (err < 0)
1294                         break;
1295                 count++;
1296         }
1297 
1298         add_node_ranges();
1299 
1300         for (i = 0; i < num_node_masks; i++) {
1301                 allocate_node_data(i);
1302                 node_set_online(i);
1303         }
1304 
1305         err = 0;
1306 out:
1307         mdesc_release(md);
1308         return err;
1309 }
1310 
1311 static int __init numa_parse_jbus(void)
1312 {
1313         unsigned long cpu, index;
1314 
1315         /* NUMA node id is encoded in bits 36 and higher, and there is
1316          * a 1-to-1 mapping from CPU ID to NUMA node ID.
1317          */
1318         index = 0;
1319         for_each_present_cpu(cpu) {
1320                 numa_cpu_lookup_table[cpu] = index;
1321                 cpumask_copy(&numa_cpumask_lookup_table[index], cpumask_of(cpu));
1322                 node_masks[index].mask = ~((1UL << 36UL) - 1UL);
1323                 node_masks[index].val = cpu << 36UL;
1324 
1325                 index++;
1326         }
1327         num_node_masks = index;
1328 
1329         add_node_ranges();
1330 
1331         for (index = 0; index < num_node_masks; index++) {
1332                 allocate_node_data(index);
1333                 node_set_online(index);
1334         }
1335 
1336         return 0;
1337 }
1338 
1339 static int __init numa_parse_sun4u(void)
1340 {
1341         if (tlb_type == cheetah || tlb_type == cheetah_plus) {
1342                 unsigned long ver;
1343 
1344                 __asm__ ("rdpr %%ver, %0" : "=r" (ver));
1345                 if ((ver >> 32UL) == __JALAPENO_ID ||
1346                     (ver >> 32UL) == __SERRANO_ID)
1347                         return numa_parse_jbus();
1348         }
1349         return -1;
1350 }
1351 
1352 static int __init bootmem_init_numa(void)
1353 {
1354         int err = -1;
1355 
1356         numadbg("bootmem_init_numa()\n");
1357 
1358         if (numa_enabled) {
1359                 if (tlb_type == hypervisor)
1360                         err = numa_parse_mdesc();
1361                 else
1362                         err = numa_parse_sun4u();
1363         }
1364         return err;
1365 }
1366 
1367 #else
1368 
1369 static int bootmem_init_numa(void)
1370 {
1371         return -1;
1372 }
1373 
1374 #endif
1375 
1376 static void __init bootmem_init_nonnuma(void)
1377 {
1378         unsigned long top_of_ram = memblock_end_of_DRAM();
1379         unsigned long total_ram = memblock_phys_mem_size();
1380 
1381         numadbg("bootmem_init_nonnuma()\n");
1382 
1383         printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
1384                top_of_ram, total_ram);
1385         printk(KERN_INFO "Memory hole size: %ldMB\n",
1386                (top_of_ram - total_ram) >> 20);
1387 
1388         init_node_masks_nonnuma();
1389         memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
1390         allocate_node_data(0);
1391         node_set_online(0);
1392 }
1393 
1394 static unsigned long __init bootmem_init(unsigned long phys_base)
1395 {
1396         unsigned long end_pfn;
1397 
1398         end_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1399         max_pfn = max_low_pfn = end_pfn;
1400         min_low_pfn = (phys_base >> PAGE_SHIFT);
1401 
1402         if (bootmem_init_numa() < 0)
1403                 bootmem_init_nonnuma();
1404 
1405         /* Dump memblock with node info. */
1406         memblock_dump_all();
1407 
1408         /* XXX cpu notifier XXX */
1409 
1410         sparse_memory_present_with_active_regions(MAX_NUMNODES);
1411         sparse_init();
1412 
1413         return end_pfn;
1414 }
1415 
1416 static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
1417 static int pall_ents __initdata;
1418 
1419 static unsigned long max_phys_bits = 40;
1420 
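     /* Report whether a kernel virtual address is backed by valid
      * memory: linear-map addresses are validated against
      * max_phys_bits and pfn_valid(), the kernel image range is always
      * valid, and anything else is resolved through the kernel page
      * tables, honouring huge PUD/PMD mappings.
      */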
1421 bool kern_addr_valid(unsigned long addr)
1422 {
1423         pgd_t *pgd;
1424         pud_t *pud;
1425         pmd_t *pmd;
1426         pte_t *pte;
1427 
1428         if ((long)addr < 0L) {
1429                 unsigned long pa = __pa(addr);
1430 
1431                 if ((addr >> max_phys_bits) != 0UL)
1432                         return false;
1433 
1434                 return pfn_valid(pa >> PAGE_SHIFT);
1435         }
1436 
1437         if (addr >= (unsigned long) KERNBASE &&
1438             addr < (unsigned long)&_end)
1439                 return true;
1440 
1441         pgd = pgd_offset_k(addr);
1442         if (pgd_none(*pgd))
1443                 return 0;
1444 
1445         pud = pud_offset(pgd, addr);
1446         if (pud_none(*pud))
1447                 return 0;
1448 
1449         if (pud_large(*pud))
1450                 return pfn_valid(pud_pfn(*pud));
1451 
1452         pmd = pmd_offset(pud, addr);
1453         if (pmd_none(*pmd))
1454                 return 0;
1455 
1456         if (pmd_large(*pmd))
1457                 return pfn_valid(pmd_pfn(*pmd));
1458 
1459         pte = pte_offset_kernel(pmd, addr);
1460         if (pte_none(*pte))
1461                 return 0;
1462 
1463         return pfn_valid(pte_pfn(*pte));
1464 }
1465 EXPORT_SYMBOL(kern_addr_valid);
1466 
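     /* Fill huge PUD entries for part of [vstart, vend) in the kernel
      * linear map, choosing between the 16GB encoding
      * (kern_linear_pte_xor[3]) and the 2GB encoding
      * (kern_linear_pte_xor[2]) based on alignment and size, and
      * return the new vstart.
      */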
1467 static unsigned long __ref kernel_map_hugepud(unsigned long vstart,
1468                                               unsigned long vend,
1469                                               pud_t *pud)
1470 {
1471         const unsigned long mask16gb = (1UL << 34) - 1UL;
1472         u64 pte_val = vstart;
1473 
1474         /* Each PUD is 8GB */
1475         if ((vstart & mask16gb) ||
1476             (vend - vstart <= mask16gb)) {
1477                 pte_val ^= kern_linear_pte_xor[2];
1478                 pud_val(*pud) = pte_val | _PAGE_PUD_HUGE;
1479 
1480                 return vstart + PUD_SIZE;
1481         }
1482 
1483         pte_val ^= kern_linear_pte_xor[3];
1484         pte_val |= _PAGE_PUD_HUGE;
1485 
1486         vend = vstart + mask16gb + 1UL;
1487         while (vstart < vend) {
1488                 pud_val(*pud) = pte_val;
1489 
1490                 pte_val += PUD_SIZE;
1491                 vstart += PUD_SIZE;
1492                 pud++;
1493         }
1494         return vstart;
1495 }
1496 
1497 static bool kernel_can_map_hugepud(unsigned long vstart, unsigned long vend,
1498                                    bool guard)
1499 {
1500         if (guard && !(vstart & ~PUD_MASK) && (vend - vstart) >= PUD_SIZE)
1501                 return true;
1502 
1503         return false;
1504 }
1505 
1506 static unsigned long __ref kernel_map_hugepmd(unsigned long vstart,
1507                                               unsigned long vend,
1508                                               pmd_t *pmd)
1509 {
1510         const unsigned long mask256mb = (1UL << 28) - 1UL;
1511         const unsigned long mask2gb = (1UL << 31) - 1UL;
1512         u64 pte_val = vstart;
1513 
1514         /* Each PMD is 8MB */
1515         if ((vstart & mask256mb) ||
1516             (vend - vstart <= mask256mb)) {
1517                 pte_val ^= kern_linear_pte_xor[0];
1518                 pmd_val(*pmd) = pte_val | _PAGE_PMD_HUGE;
1519 
1520                 return vstart + PMD_SIZE;
1521         }
1522 
1523         if ((vstart & mask2gb) ||
1524             (vend - vstart <= mask2gb)) {
1525                 pte_val ^= kern_linear_pte_xor[1];
1526                 pte_val |= _PAGE_PMD_HUGE;
1527                 vend = vstart + mask256mb + 1UL;
1528         } else {
1529                 pte_val ^= kern_linear_pte_xor[2];
1530                 pte_val |= _PAGE_PMD_HUGE;
1531                 vend = vstart + mask2gb + 1UL;
1532         }
1533 
1534         while (vstart < vend) {
1535                 pmd_val(*pmd) = pte_val;
1536 
1537                 pte_val += PMD_SIZE;
1538                 vstart += PMD_SIZE;
1539                 pmd++;
1540         }
1541 
1542         return vstart;
1543 }
1544 
1545 static bool kernel_can_map_hugepmd(unsigned long vstart, unsigned long vend,
1546                                    bool guard)
1547 {
1548         if (guard && !(vstart & ~PMD_MASK) && (vend - vstart) >= PMD_SIZE)
1549                 return true;
1550 
1551         return false;
1552 }
1553 
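     /* Build kernel linear mappings for the physical range
      * [pstart, pend) with the given protections, allocating page
      * table levels from bootmem as needed and using huge PUD/PMD
      * mappings when 'use_huge' permits.  Returns the number of bytes
      * allocated for page tables.
      */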
1554 static unsigned long __ref kernel_map_range(unsigned long pstart,
1555                                             unsigned long pend, pgprot_t prot,
1556                                             bool use_huge)
1557 {
1558         unsigned long vstart = PAGE_OFFSET + pstart;
1559         unsigned long vend = PAGE_OFFSET + pend;
1560         unsigned long alloc_bytes = 0UL;
1561 
1562         if ((vstart & ~PAGE_MASK) || (vend & ~PAGE_MASK)) {
1563                 prom_printf("kernel_map: Unaligned physmem[%lx:%lx]\n",
1564                             vstart, vend);
1565                 prom_halt();
1566         }
1567 
1568         while (vstart < vend) {
1569                 unsigned long this_end, paddr = __pa(vstart);
1570                 pgd_t *pgd = pgd_offset_k(vstart);
1571                 pud_t *pud;
1572                 pmd_t *pmd;
1573                 pte_t *pte;
1574 
1575                 if (pgd_none(*pgd)) {
1576                         pud_t *new;
1577 
1578                         new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1579                         alloc_bytes += PAGE_SIZE;
1580                         pgd_populate(&init_mm, pgd, new);
1581                 }
1582                 pud = pud_offset(pgd, vstart);
1583                 if (pud_none(*pud)) {
1584                         pmd_t *new;
1585 
1586                         if (kernel_can_map_hugepud(vstart, vend, use_huge)) {
1587                                 vstart = kernel_map_hugepud(vstart, vend, pud);
1588                                 continue;
1589                         }
1590                         new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1591                         alloc_bytes += PAGE_SIZE;
1592                         pud_populate(&init_mm, pud, new);
1593                 }
1594 
1595                 pmd = pmd_offset(pud, vstart);
1596                 if (pmd_none(*pmd)) {
1597                         pte_t *new;
1598 
1599                         if (kernel_can_map_hugepmd(vstart, vend, use_huge)) {
1600                                 vstart = kernel_map_hugepmd(vstart, vend, pmd);
1601                                 continue;
1602                         }
1603                         new = __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
1604                         alloc_bytes += PAGE_SIZE;
1605                         pmd_populate_kernel(&init_mm, pmd, new);
1606                 }
1607 
1608                 pte = pte_offset_kernel(pmd, vstart);
1609                 this_end = (vstart + PMD_SIZE) & PMD_MASK;
1610                 if (this_end > vend)
1611                         this_end = vend;
1612 
1613                 while (vstart < this_end) {
1614                         pte_val(*pte) = (paddr | pgprot_val(prot));
1615 
1616                         vstart += PAGE_SIZE;
1617                         paddr += PAGE_SIZE;
1618                         pte++;
1619                 }
1620         }
1621 
1622         return alloc_bytes;
1623 }
1624 
1625 static void __init flush_all_kernel_tsbs(void)
1626 {
1627         int i;
1628 
1629         for (i = 0; i < KERNEL_TSB_NENTRIES; i++) {
1630                 struct tsb *ent = &swapper_tsb[i];
1631 
1632                 ent->tag = (1UL << TSB_TAG_INVALID_BIT);
1633         }
1634 #ifndef CONFIG_DEBUG_PAGEALLOC
1635         for (i = 0; i < KERNEL_TSB4M_NENTRIES; i++) {
1636                 struct tsb *ent = &swapper_4m_tsb[i];
1637 
1638                 ent->tag = (1UL << TSB_TAG_INVALID_BIT);
1639         }
1640 #endif
1641 }
1642 
1643 extern unsigned int kvmap_linear_patch[1];
1644 
1645 static void __init kernel_physical_mapping_init(void)
1646 {
1647         unsigned long i, mem_alloced = 0UL;
1648         bool use_huge = true;
1649 
1650 #ifdef CONFIG_DEBUG_PAGEALLOC
1651         use_huge = false;
1652 #endif
1653         for (i = 0; i < pall_ents; i++) {
1654                 unsigned long phys_start, phys_end;
1655 
1656                 phys_start = pall[i].phys_addr;
1657                 phys_end = phys_start + pall[i].reg_size;
1658 
1659                 mem_alloced += kernel_map_range(phys_start, phys_end,
1660                                                 PAGE_KERNEL, use_huge);
1661         }
1662 
1663         printk("Allocated %ld bytes for kernel page tables.\n",
1664                mem_alloced);
1665 
1666         kvmap_linear_patch[0] = 0x01000000; /* nop */
1667         flushi(&kvmap_linear_patch[0]);
1668 
1669         flush_all_kernel_tsbs();
1670 
1671         __flush_tlb_all();
1672 }
1673 
1674 #ifdef CONFIG_DEBUG_PAGEALLOC
1675 void kernel_map_pages(struct page *page, int numpages, int enable)
1676 {
1677         unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
1678         unsigned long phys_end = phys_start + (numpages * PAGE_SIZE);
1679 
1680         kernel_map_range(phys_start, phys_end,
1681                          (enable ? PAGE_KERNEL : __pgprot(0)), false);
1682 
1683         flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
1684                                PAGE_OFFSET + phys_end);
1685 
1686         /* We should perform an IPI and flush all TLBs,
1687          * but that can deadlock, so flush only the current cpu.
1688          */
1689         __flush_tlb_kernel_range(PAGE_OFFSET + phys_start,
1690                                  PAGE_OFFSET + phys_end);
1691 }
1692 #endif
1693 
1694 unsigned long __init find_ecache_flush_span(unsigned long size)
1695 {
1696         int i;
1697 
1698         for (i = 0; i < pavail_ents; i++) {
1699                 if (pavail[i].reg_size >= size)
1700                         return pavail[i].phys_addr;
1701         }
1702 
1703         return ~0UL;
1704 }
1705 
1706 unsigned long PAGE_OFFSET;
1707 EXPORT_SYMBOL(PAGE_OFFSET);
1708 
1709 unsigned long VMALLOC_END   = 0x0000010000000000UL;
1710 EXPORT_SYMBOL(VMALLOC_END);
1711 
1712 unsigned long sparc64_va_hole_top =    0xfffff80000000000UL;
1713 unsigned long sparc64_va_hole_bottom = 0x0000080000000000UL;
1714 
1715 static void __init setup_page_offset(void)
1716 {
1717         if (tlb_type == cheetah || tlb_type == cheetah_plus) {
1718                 /* Cheetah/Panther support a full 64-bit virtual
1719                  * address, so we can use all that our page tables
1720                  * support.
1721                  */
1722                 sparc64_va_hole_top =    0xfff0000000000000UL;
1723                 sparc64_va_hole_bottom = 0x0010000000000000UL;
1724 
1725                 max_phys_bits = 42;
1726         } else if (tlb_type == hypervisor) {
1727                 switch (sun4v_chip_type) {
1728                 case SUN4V_CHIP_NIAGARA1:
1729                 case SUN4V_CHIP_NIAGARA2:
1730                         /* T1 and T2 support 48-bit virtual addresses.  */
1731                         sparc64_va_hole_top =    0xffff800000000000UL;
1732                         sparc64_va_hole_bottom = 0x0000800000000000UL;
1733 
1734                         max_phys_bits = 39;
1735                         break;
1736                 case SUN4V_CHIP_NIAGARA3:
1737                         /* T3 supports 48-bit virtual addresses.  */
1738                         sparc64_va_hole_top =    0xffff800000000000UL;
1739                         sparc64_va_hole_bottom = 0x0000800000000000UL;
1740 
1741                         max_phys_bits = 43;
1742                         break;
1743                 case SUN4V_CHIP_NIAGARA4:
1744                 case SUN4V_CHIP_NIAGARA5:
1745                 case SUN4V_CHIP_SPARC64X:
1746                 case SUN4V_CHIP_SPARC_M6:
1747                         /* T4 and later support 52-bit virtual addresses.  */
1748                         sparc64_va_hole_top =    0xfff8000000000000UL;
1749                         sparc64_va_hole_bottom = 0x0008000000000000UL;
1750                         max_phys_bits = 47;
1751                         break;
1752                 case SUN4V_CHIP_SPARC_M7:
1753                 default:
1754                         /* M7 and later support 52-bit virtual addresses.  */
1755                         sparc64_va_hole_top =    0xfff8000000000000UL;
1756                         sparc64_va_hole_bottom = 0x0008000000000000UL;
1757                         max_phys_bits = 49;
1758                         break;
1759                 }
1760         }
1761 
1762         if (max_phys_bits > MAX_PHYS_ADDRESS_BITS) {
1763                 prom_printf("MAX_PHYS_ADDRESS_BITS is too small, need %lu\n",
1764                             max_phys_bits);
1765                 prom_halt();
1766         }
1767 
1768         PAGE_OFFSET = sparc64_va_hole_top;
1769         VMALLOC_END = ((sparc64_va_hole_bottom >> 1) +
1770                        (sparc64_va_hole_bottom >> 2));
1771 
1772         pr_info("MM: PAGE_OFFSET is 0x%016lx (max_phys_bits == %lu)\n",
1773                 PAGE_OFFSET, max_phys_bits);
1774         pr_info("MM: VMALLOC [0x%016lx --> 0x%016lx]\n",
1775                 VMALLOC_START, VMALLOC_END);
1776         pr_info("MM: VMEMMAP [0x%016lx --> 0x%016lx]\n",
1777                 VMEMMAP_BASE, VMEMMAP_BASE << 1);
1778 }
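/* Editorial worked example, not part of the original init_64.c: for
 * the SUN4V_CHIP_NIAGARA4 case above, sparc64_va_hole_bottom ends up
 * as 0x0008000000000000UL, so
 *
 *   PAGE_OFFSET = sparc64_va_hole_top = 0xfff8000000000000UL
 *   VMALLOC_END = (0x0008000000000000 >> 1) + (0x0008000000000000 >> 2)
 *               = 0x0004000000000000 + 0x0002000000000000
 *               = 0x0006000000000000UL
 *
 * i.e. VMALLOC_END sits three quarters of the way up the lower,
 * non-hole half of the virtual address space.
 */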
1779 
1780 static void __init tsb_phys_patch(void)
1781 {
1782         struct tsb_ldquad_phys_patch_entry *pquad;
1783         struct tsb_phys_patch_entry *p;
1784 
1785         pquad = &__tsb_ldquad_phys_patch;
1786         while (pquad < &__tsb_ldquad_phys_patch_end) {
1787                 unsigned long addr = pquad->addr;
1788 
1789                 if (tlb_type == hypervisor)
1790                         *(unsigned int *) addr = pquad->sun4v_insn;
1791                 else
1792                         *(unsigned int *) addr = pquad->sun4u_insn;
1793                 wmb();
1794                 __asm__ __volatile__("flush     %0"
1795                                      : /* no outputs */
1796                                      : "r" (addr));
1797 
1798                 pquad++;
1799         }
1800 
1801         p = &__tsb_phys_patch;
1802         while (p < &__tsb_phys_patch_end) {
1803                 unsigned long addr = p->addr;
1804 
1805                 *(unsigned int *) addr = p->insn;
1806                 wmb();
1807                 __asm__ __volatile__("flush     %0"
1808                                      : /* no outputs */
1809                                      : "r" (addr));
1810 
1811                 p++;
1812         }
1813 }
1814 
1815 /* Don't mark as init, we give this to the Hypervisor.  */
1816 #ifndef CONFIG_DEBUG_PAGEALLOC
1817 #define NUM_KTSB_DESCR  2
1818 #else
1819 #define NUM_KTSB_DESCR  1
1820 #endif
1821 static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR];
1822 
1823 /* The swapper TSBs are loaded with a base sequence of:
1824  *
1825  *      sethi   %uhi(SYMBOL), REG1
1826  *      sethi   %hi(SYMBOL), REG2
1827  *      or      REG1, %ulo(SYMBOL), REG1
1828  *      or      REG2, %lo(SYMBOL), REG2
1829  *      sllx    REG1, 32, REG1
1830  *      or      REG1, REG2, REG1
1831  *
1832  * When we use physical addressing for the TSB accesses, we patch the
1833  * first four instructions in the above sequence.
1834  */
1835 
1836 static void patch_one_ktsb_phys(unsigned int *start, unsigned int *end, unsigned long pa)
1837 {
1838         unsigned long high_bits, low_bits;
1839 
1840         high_bits = (pa >> 32) & 0xffffffff;
1841         low_bits = (pa >> 0) & 0xffffffff;
1842 
1843         while (start < end) {
1844                 unsigned int *ia = (unsigned int *)(unsigned long)*start;
1845 
1846                 ia[0] = (ia[0] & ~0x3fffff) | (high_bits >> 10);
1847                 __asm__ __volatile__("flush     %0" : : "r" (ia));
1848 
1849                 ia[1] = (ia[1] & ~0x3fffff) | (low_bits >> 10);
1850                 __asm__ __volatile__("flush     %0" : : "r" (ia + 1));
1851 
1852                 ia[2] = (ia[2] & ~0x1fff) | (high_bits & 0x3ff);
1853                 __asm__ __volatile__("flush     %0" : : "r" (ia + 2));
1854 
1855                 ia[3] = (ia[3] & ~0x1fff) | (low_bits & 0x3ff);
1856                 __asm__ __volatile__("flush     %0" : : "r" (ia + 3));
1857 
1858                 start++;
1859         }
1860 }
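/* Editorial worked example, not part of the original init_64.c: with
 * a hypothetical TSB physical address pa == 0x000000400dea6000UL,
 * patch_one_ktsb_phys() splits it as
 *
 *   high_bits = pa >> 32        = 0x00000040
 *   low_bits  = pa & 0xffffffff = 0x0dea6000
 *
 * and rewrites the four instructions of the sequence shown above:
 *
 *   ia[0] (sethi %uhi) imm22 <- high_bits >> 10   = 0x00000
 *   ia[1] (sethi %hi)  imm22 <- low_bits  >> 10   = 0x37a98
 *   ia[2] (or %ulo)    imm13 <- high_bits & 0x3ff = 0x040
 *   ia[3] (or %lo)     imm13 <- low_bits  & 0x3ff = 0x000
 */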
1861 
1862 static void ktsb_phys_patch(void)
1863 {
1864         extern unsigned int __swapper_tsb_phys_patch;
1865         extern unsigned int __swapper_tsb_phys_patch_end;
1866         unsigned long ktsb_pa;
1867 
1868         ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
1869         patch_one_ktsb_phys(&__swapper_tsb_phys_patch,
1870                             &__swapper_tsb_phys_patch_end, ktsb_pa);
1871 #ifndef CONFIG_DEBUG_PAGEALLOC
1872         {
1873         extern unsigned int __swapper_4m_tsb_phys_patch;
1874         extern unsigned int __swapper_4m_tsb_phys_patch_end;
1875         ktsb_pa = (kern_base +
1876                    ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
1877         patch_one_ktsb_phys(&__swapper_4m_tsb_phys_patch,
1878                             &__swapper_4m_tsb_phys_patch_end, ktsb_pa);
1879         }
1880 #endif
1881 }
1882 
1883 static void __init sun4v_ktsb_init(void)
1884 {
1885         unsigned long ktsb_pa;
1886 
1887         /* First KTSB for PAGE_SIZE mappings.  */
1888         ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
1889 
1890         switch (PAGE_SIZE) {
1891         case 8 * 1024:
1892         default:
1893                 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K;
1894                 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K;
1895                 break;
1896 
1897         case 64 * 1024:
1898                 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K;
1899                 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K;
1900                 break;
1901 
1902         case 512 * 1024:
1903                 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K;
1904                 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K;
1905                 break;
1906 
1907         case 4 * 1024 * 1024:
1908                 ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB;
1909                 ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB;
1910                 break;
1911         }
1912 
1913         ktsb_descr[0].assoc = 1;
1914         ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES;
1915         ktsb_descr[0].ctx_idx = 0;
1916         ktsb_descr[0].tsb_base = ktsb_pa;
1917         ktsb_descr[0].resv = 0;
1918 
1919 #ifndef CONFIG_DEBUG_PAGEALLOC
1920         /* Second KTSB for 4MB/256MB/2GB/16GB mappings.  */
1921         ktsb_pa = (kern_base +
1922                    ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
1923 
1924         ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
1925         ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB |
1926                                     HV_PGSZ_MASK_256MB |
1927                                     HV_PGSZ_MASK_2GB |
1928                                     HV_PGSZ_MASK_16GB) &
1929                                    cpu_pgsz_mask);
1930         ktsb_descr[1].assoc = 1;
1931         ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
1932         ktsb_descr[1].ctx_idx = 0;
1933         ktsb_descr[1].tsb_base = ktsb_pa;
1934         ktsb_descr[1].resv = 0;
1935 #endif
1936 }
1937 
1938 void sun4v_ktsb_register(void)
1939 {
1940         unsigned long pa, ret;
1941 
1942         pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE);
1943 
1944         ret = sun4v_mmu_tsb_ctx0(NUM_KTSB_DESCR, pa);
1945         if (ret != 0) {
1946                 prom_printf("hypervisor_mmu_tsb_ctx0[%lx]: "
1947                             "errors with %lx\n", pa, ret);
1948                 prom_halt();
1949         }
1950 }
1951 
1952 static void __init sun4u_linear_pte_xor_finalize(void)
1953 {
1954 #ifndef CONFIG_DEBUG_PAGEALLOC
1955         /* This is where we would add Panther support for
1956          * 32MB and 256MB pages.
1957          */
1958 #endif
1959 }
1960 
1961 static void __init sun4v_linear_pte_xor_finalize(void)
1962 {
1963 #ifndef CONFIG_DEBUG_PAGEALLOC
1964         if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
1965                 kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
1966                         PAGE_OFFSET;
1967                 kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1968                                            _PAGE_P_4V | _PAGE_W_4V);
1969         } else {
1970                 kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
1971         }
1972 
1973         if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
1974                 kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
1975                         PAGE_OFFSET;
1976                 kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1977                                            _PAGE_P_4V | _PAGE_W_4V);
1978         } else {
1979                 kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
1980         }
1981 
1982         if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
1983                 kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
1984                         PAGE_OFFSET;
1985                 kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
1986                                            _PAGE_P_4V | _PAGE_W_4V);
1987         } else {
1988                 kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
1989         }
1990 #endif
1991 }
1992 
1993 /* paging_init() sets up the page tables */
1994 
1995 static unsigned long last_valid_pfn;
1996 
1997 static void sun4u_pgprot_init(void);
1998 static void sun4v_pgprot_init(void);
1999 
2000 void __init paging_init(void)
2001 {
2002         unsigned long end_pfn, shift, phys_base;
2003         unsigned long real_end, i;
2004         int node;
2005 
2006         setup_page_offset();
2007 
2008         /* These build time checks make sure that the dcache_dirty_cpu()
2009          * page->flags usage will work.
2010          *
2011          * When a page gets marked as dcache-dirty, we store the
2012          * cpu number starting at bit 32 in the page->flags.  Also,
2013          * functions like clear_dcache_dirty_cpu use the cpu mask
2014          * in 13-bit signed-immediate instruction fields.
2015          */
2016 
2017         /*
2018          * Page flags must not reach into upper 32 bits that are used
2019          * for the cpu number
2020          */
2021         BUILD_BUG_ON(NR_PAGEFLAGS > 32);
2022 
2023         /*
2024          * The bit fields placed in the high range must not reach below
2025          * the 32 bit boundary. Otherwise we cannot place the cpu field
2026          * at the 32 bit boundary.
2027          */
2028         BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH +
2029                 ilog2(roundup_pow_of_two(NR_CPUS)) > 32);
2030 
2031         BUILD_BUG_ON(NR_CPUS > 4096);
2032 
2033         kern_base = (prom_boot_mapping_phys_low >> ILOG2_4MB) << ILOG2_4MB;
2034         kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
2035 
2036         /* Invalidate both kernel TSBs.  */
2037         memset(swapper_tsb, 0x40, sizeof(swapper_tsb));
2038 #ifndef CONFIG_DEBUG_PAGEALLOC
2039         memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
2040 #endif
2041 
2042         if (tlb_type == hypervisor)
2043                 sun4v_pgprot_init();
2044         else
2045                 sun4u_pgprot_init();
2046 
2047         if (tlb_type == cheetah_plus ||
2048             tlb_type == hypervisor) {
2049                 tsb_phys_patch();
2050                 ktsb_phys_patch();
2051         }
2052 
2053         if (tlb_type == hypervisor)
2054                 sun4v_patch_tlb_handlers();
2055 
2056         /* Find available physical memory...
2057          *
2058          * Read it twice in order to work around a bug in openfirmware.
2059          * The call to grab this table itself can cause openfirmware to
2060          * allocate memory, which in turn can take away some space from
2061          * the list of available memory.  Reading it twice makes sure
2062          * we really do get the final value.
2063          */
2064         read_obp_translations();
2065         read_obp_memory("reg", &pall[0], &pall_ents);
2066         read_obp_memory("available", &pavail[0], &pavail_ents);
2067         read_obp_memory("available", &pavail[0], &pavail_ents);
2068 
2069         phys_base = 0xffffffffffffffffUL;
2070         for (i = 0; i < pavail_ents; i++) {
2071                 phys_base = min(phys_base, pavail[i].phys_addr);
2072                 memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
2073         }
2074 
2075         memblock_reserve(kern_base, kern_size);
2076 
2077         find_ramdisk(phys_base);
2078 
2079         memblock_enforce_memory_limit(cmdline_memory_size);
2080 
2081         memblock_allow_resize();
2082         memblock_dump_all();
2083 
2084         set_bit(0, mmu_context_bmap);
2085 
2086         shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);
2087 
2088         real_end = (unsigned long)_end;
2089         num_kernel_image_mappings = DIV_ROUND_UP(real_end - KERNBASE, 1 << ILOG2_4MB);
2090         printk("Kernel: Using %d locked TLB entries for main kernel image.\n",
2091                num_kernel_image_mappings);
2092 
2093         /* Set kernel pgd to upper alias so physical page computations
2094          * work.
2095          */
2096         init_mm.pgd += ((shift) / (sizeof(pgd_t)));
2097
2098         memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir));
2099 
2100         inherit_prom_mappings();
2101
2102         /* Ok, we can use our TLB miss and window trap handlers safely.  */
2103         setup_tba();
2104 
2105         __flush_tlb_all();
2106 
2107         prom_build_devicetree();
2108         of_populate_present_mask();
2109 #ifndef CONFIG_SMP
2110         of_fill_in_cpu_data();
2111 #endif
2112 
2113         if (tlb_type == hypervisor) {
2114                 sun4v_mdesc_init();
2115                 mdesc_populate_present_mask(cpu_all_mask);
2116 #ifndef CONFIG_SMP
2117                 mdesc_fill_in_cpu_data(cpu_all_mask);
2118 #endif
2119                 mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask);
2120 
2121                 sun4v_linear_pte_xor_finalize();
2122 
2123                 sun4v_ktsb_init();
2124                 sun4v_ktsb_register();
2125         } else {
2126                 unsigned long impl, ver;
2127 
2128                 cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
2129                                  HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
2130 
2131                 __asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
2132                 impl = ((ver >> 32) & 0xffff);
2133                 if (impl == PANTHER_IMPL)
2134                         cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB |
2135                                           HV_PGSZ_MASK_256MB);
2136 
2137                 sun4u_linear_pte_xor_finalize();
2138         }
2139 
2140         /* Flush the TLBs and the 4M TSB so that the updated linear
2141          * pte XOR settings are realized for all mappings.
2142          */
2143         __flush_tlb_all();
2144 #ifndef CONFIG_DEBUG_PAGEALLOC
2145         memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
2146 #endif
2147         __flush_tlb_all();
2148 
2149         /* Setup bootmem... */
2150         last_valid_pfn = end_pfn = bootmem_init(phys_base);
2151 
2152         /* Once the OF device tree and MDESC have been setup, we know
2153          * the list of possible cpus.  Therefore we can allocate the
2154          * IRQ stacks.
2155          */
2156         for_each_possible_cpu(i) {
2157                 node = cpu_to_node(i);
2158 
2159                 softirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
2160                                                         THREAD_SIZE,
2161                                                         THREAD_SIZE, 0);
2162                 hardirq_stack[i] = __alloc_bootmem_node(NODE_DATA(node),
2163                                                         THREAD_SIZE,
2164                                                         THREAD_SIZE, 0);
2165         }
2166 
2167         kernel_physical_mapping_init();
2168 
2169         {
2170                 unsigned long max_zone_pfns[MAX_NR_ZONES];
2171 
2172                 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
2173 
2174                 max_zone_pfns[ZONE_NORMAL] = end_pfn;
2175 
2176                 free_area_init_nodes(max_zone_pfns);
2177         }
2178 
2179         printk("Booting Linux...\n");
2180 }
2181 
2182 int page_in_phys_avail(unsigned long paddr)
2183 {
2184         int i;
2185 
2186         paddr &= PAGE_MASK;
2187 
2188         for (i = 0; i < pavail_ents; i++) {
2189                 unsigned long start, end;
2190 
2191                 start = pavail[i].phys_addr;
2192                 end = start + pavail[i].reg_size;
2193 
2194                 if (paddr >= start && paddr < end)
2195                         return 1;
2196         }
2197         if (paddr >= kern_base && paddr < (kern_base + kern_size))
2198                 return 1;
2199 #ifdef CONFIG_BLK_DEV_INITRD
2200         if (paddr >= __pa(initrd_start) &&
2201             paddr < __pa(PAGE_ALIGN(initrd_end)))
2202                 return 1;
2203 #endif
2204 
2205         return 0;
2206 }
2207 
2208 static void __init register_page_bootmem_info(void)
2209 {
2210 #ifdef CONFIG_NEED_MULTIPLE_NODES
2211         int i;
2212 
2213         for_each_online_node(i)
2214                 if (NODE_DATA(i)->node_spanned_pages)
2215                         register_page_bootmem_info_node(NODE_DATA(i));
2216 #endif
2217 }
2218 void __init mem_init(void)
2219 {
2220         high_memory = __va(last_valid_pfn << PAGE_SHIFT);
2221 
2222         register_page_bootmem_info();
2223         free_all_bootmem();
2224 
2225         /*
2226          * Set up the zero page, mark it reserved, so that page count
2227          * is not manipulated when freeing the page from user ptes.
2228          */
2229         mem_map_zero = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0);
2230         if (mem_map_zero == NULL) {
2231                 prom_printf("paging_init: Cannot alloc zero page.\n");
2232                 prom_halt();
2233         }
2234         mark_page_reserved(mem_map_zero);
2235 
2236         mem_init_print_info(NULL);
2237 
2238         if (tlb_type == cheetah || tlb_type == cheetah_plus)
2239                 cheetah_ecache_flush_init();
2240 }
2241 
2242 void free_initmem(void)
2243 {
2244         unsigned long addr, initend;
2245         int do_free = 1;
2246 
2247         /* If the physical memory maps were trimmed by kernel command
2248          * line options, don't even try freeing this initmem stuff up.
2249          * The kernel image could have been in the trimmed out region
2250          * and if so the freeing below will free invalid page structs.
2251          */
2252         if (cmdline_memory_size)
2253                 do_free = 0;
2254 
2255         /*
2256          * The init section is aligned to 8k in vmlinux.lds. Page align for >8k page sizes.
2257          */
2258         addr = PAGE_ALIGN((unsigned long)(__init_begin));
2259         initend = (unsigned long)(__init_end) & PAGE_MASK;
2260         for (; addr < initend; addr += PAGE_SIZE) {
2261                 unsigned long page;
2262 
2263                 page = (addr +
2264                         ((unsigned long) __va(kern_base)) -
2265                         ((unsigned long) KERNBASE));
2266                 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
2267 
2268                 if (do_free)
2269                         free_reserved_page(virt_to_page(page));
2270         }
2271 }
2272 
2273 #ifdef CONFIG_BLK_DEV_INITRD
2274 void free_initrd_mem(unsigned long start, unsigned long end)
2275 {
2276         free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
2277                            "initrd");
2278 }
2279 #endif
2280 
2281 #define _PAGE_CACHE_4U  (_PAGE_CP_4U | _PAGE_CV_4U)
2282 #define _PAGE_CACHE_4V  (_PAGE_CP_4V | _PAGE_CV_4V)
2283 #define __DIRTY_BITS_4U  (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
2284 #define __DIRTY_BITS_4V  (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
2285 #define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
2286 #define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
2287 
2288 pgprot_t PAGE_KERNEL __read_mostly;
2289 EXPORT_SYMBOL(PAGE_KERNEL);
2290 
2291 pgprot_t PAGE_KERNEL_LOCKED __read_mostly;
2292 pgprot_t PAGE_COPY __read_mostly;
2293 
2294 pgprot_t PAGE_SHARED __read_mostly;
2295 EXPORT_SYMBOL(PAGE_SHARED);
2296 
2297 unsigned long pg_iobits __read_mostly;
2298 
2299 unsigned long _PAGE_IE __read_mostly;
2300 EXPORT_SYMBOL(_PAGE_IE);
2301 
2302 unsigned long _PAGE_E __read_mostly;
2303 EXPORT_SYMBOL(_PAGE_E);
2304 
2305 unsigned long _PAGE_CACHE __read_mostly;
2306 EXPORT_SYMBOL(_PAGE_CACHE);
2307 
2308 #ifdef CONFIG_SPARSEMEM_VMEMMAP
2309 int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
2310                                int node)
2311 {
2312         unsigned long pte_base;
2313 
2314         pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
2315                     _PAGE_CP_4U | _PAGE_CV_4U |
2316                     _PAGE_P_4U | _PAGE_W_4U);
2317         if (tlb_type == hypervisor)
2318                 pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
2319                             _PAGE_CP_4V | _PAGE_CV_4V |
2320                             _PAGE_P_4V | _PAGE_W_4V);
2321 
2322         pte_base |= _PAGE_PMD_HUGE;
2323 
2324         vstart = vstart & PMD_MASK;
2325         vend = ALIGN(vend, PMD_SIZE);
2326         for (; vstart < vend; vstart += PMD_SIZE) {
2327                 pgd_t *pgd = pgd_offset_k(vstart);
2328                 unsigned long pte;
2329                 pud_t *pud;
2330                 pmd_t *pmd;
2331 
2332                 if (pgd_none(*pgd)) {
2333                         pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
2334 
2335                         if (!new)
2336                                 return -ENOMEM;
2337                         pgd_populate(&init_mm, pgd, new);
2338                 }
2339 
2340                 pud = pud_offset(pgd, vstart);
2341                 if (pud_none(*pud)) {
2342                         pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node);
2343 
2344                         if (!new)
2345                                 return -ENOMEM;
2346                         pud_populate(&init_mm, pud, new);
2347                 }
2348 
2349                 pmd = pmd_offset(pud, vstart);
2350 
2351                 pte = pmd_val(*pmd);
2352                 if (!(pte & _PAGE_VALID)) {
2353                         void *block = vmemmap_alloc_block(PMD_SIZE, node);
2354 
2355                         if (!block)
2356                                 return -ENOMEM;
2357 
2358                         pmd_val(*pmd) = pte_base | __pa(block);
2359                 }
2360         }
2361 
2362         return 0;
2363 }
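/* Editorial worked example, not part of the original init_64.c: each
 * iteration above backs PMD_SIZE (8MB with 8K base pages) of the
 * vmemmap with one physically contiguous block.  Assuming a 64-byte
 * struct page, one such block describes 8MB / 64 == 128K page
 * structs, i.e. roughly 1GB of physical memory per vmemmap PMD.
 */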
2364 
2365 void vmemmap_free(unsigned long start, unsigned long end)
2366 {
2367 }
2368 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
2369 
2370 static void prot_init_common(unsigned long page_none,
2371                              unsigned long page_shared,
2372                              unsigned long page_copy,
2373                              unsigned long page_readonly,
2374                              unsigned long page_exec_bit)
2375 {
2376         PAGE_COPY = __pgprot(page_copy);
2377         PAGE_SHARED = __pgprot(page_shared);
2378 
2379         protection_map[0x0] = __pgprot(page_none);
2380         protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit);
2381         protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit);
2382         protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit);
2383         protection_map[0x4] = __pgprot(page_readonly);
2384         protection_map[0x5] = __pgprot(page_readonly);
2385         protection_map[0x6] = __pgprot(page_copy);
2386         protection_map[0x7] = __pgprot(page_copy);
2387         protection_map[0x8] = __pgprot(page_none);
2388         protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit);
2389         protection_map[0xa] = __pgprot(page_shared & ~page_exec_bit);
2390         protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit);
2391         protection_map[0xc] = __pgprot(page_readonly);
2392         protection_map[0xd] = __pgprot(page_readonly);
2393         protection_map[0xe] = __pgprot(page_shared);
2394         protection_map[0xf] = __pgprot(page_shared);
2395 }
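/* Editorial sketch, not part of the original init_64.c: the sixteen
 * protection_map[] slots filled above are indexed by the low vma flag
 * bits (bit 0 = VM_READ, bit 1 = VM_WRITE, bit 2 = VM_EXEC,
 * bit 3 = VM_SHARED).  A private PROT_READ|PROT_WRITE mapping thus
 * resolves to slot 0x3 (page_copy without exec, i.e. copy-on-write),
 * while the same protections on a MAP_SHARED mapping resolve to slot
 * 0xb (page_shared without exec).  The helper below is hypothetical
 * and only illustrates the lookup that the generic
 * vm_get_page_prot() performs.
 */
#if 0
static pgprot_t example_vma_page_prot(unsigned long vm_flags)
{
	/* Mask down to the four flag bits that form the table index. */
	return protection_map[vm_flags & (VM_READ | VM_WRITE |
					  VM_EXEC | VM_SHARED)];
}
#endif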
2396 
2397 static void __init sun4u_pgprot_init(void)
2398 {
2399         unsigned long page_none, page_shared, page_copy, page_readonly;
2400         unsigned long page_exec_bit;
2401         int i;
2402 
2403         PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
2404                                 _PAGE_CACHE_4U | _PAGE_P_4U |
2405                                 __ACCESS_BITS_4U | __DIRTY_BITS_4U |
2406                                 _PAGE_EXEC_4U);
2407         PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
2408                                        _PAGE_CACHE_4U | _PAGE_P_4U |
2409                                        __ACCESS_BITS_4U | __DIRTY_BITS_4U |
2410                                        _PAGE_EXEC_4U | _PAGE_L_4U);
2411 
2412         _PAGE_IE = _PAGE_IE_4U;
2413         _PAGE_E = _PAGE_E_4U;
2414         _PAGE_CACHE = _PAGE_CACHE_4U;
2415 
2416         pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U |
2417                      __ACCESS_BITS_4U | _PAGE_E_4U);
2418 
2419 #ifdef CONFIG_DEBUG_PAGEALLOC
2420         kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
2421 #else
2422         kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
2423                 PAGE_OFFSET;
2424 #endif
2425         kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
2426                                    _PAGE_P_4U | _PAGE_W_4U);
2427 
2428         for (i = 1; i < 4; i++)
2429                 kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
2430 
2431         _PAGE_ALL_SZ_BITS =  (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
2432                               _PAGE_SZ64K_4U | _PAGE_SZ8K_4U |
2433                               _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U);
2434 
2435 
2436         page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U;
2437         page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2438                        __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U);
2439         page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2440                        __ACCESS_BITS_4U | _PAGE_EXEC_4U);
2441         page_readonly   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
2442                            __ACCESS_BITS_4U | _PAGE_EXEC_4U);
2443 
2444         page_exec_bit = _PAGE_EXEC_4U;
2445 
2446         prot_init_common(page_none, page_shared, page_copy, page_readonly,
2447                          page_exec_bit);
2448 }
2449 
2450 static void __init sun4v_pgprot_init(void)
2451 {
2452         unsigned long page_none, page_shared, page_copy, page_readonly;
2453         unsigned long page_exec_bit;
2454         int i;
2455 
2456         PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
2457                                 _PAGE_CACHE_4V | _PAGE_P_4V |
2458                                 __ACCESS_BITS_4V | __DIRTY_BITS_4V |
2459                                 _PAGE_EXEC_4V);
2460         PAGE_KERNEL_LOCKED = PAGE_KERNEL;
2461 
2462         _PAGE_IE = _PAGE_IE_4V;
2463         _PAGE_E = _PAGE_E_4V;
2464         _PAGE_CACHE = _PAGE_CACHE_4V;
2465 
2466 #ifdef CONFIG_DEBUG_PAGEALLOC
2467         kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
2468 #else
2469         kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
2470                 PAGE_OFFSET;
2471 #endif
2472         kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
2473                                    _PAGE_P_4V | _PAGE_W_4V);
2474 
2475         for (i = 1; i < 4; i++)
2476                 kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
2477 
2478         pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
2479                      __ACCESS_BITS_4V | _PAGE_E_4V);
2480 
2481         _PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V |
2482                              _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V |
2483                              _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
2484                              _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
2485 
2486         page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
2487         page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2488                        __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
2489         page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2490                        __ACCESS_BITS_4V | _PAGE_EXEC_4V);
2491         page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
2492                          __ACCESS_BITS_4V | _PAGE_EXEC_4V);
2493 
2494         page_exec_bit = _PAGE_EXEC_4V;
2495 
2496         prot_init_common(page_none, page_shared, page_copy, page_readonly,
2497                          page_exec_bit);
2498 }
2499 
2500 unsigned long pte_sz_bits(unsigned long sz)
2501 {
2502         if (tlb_type == hypervisor) {
2503                 switch (sz) {
2504                 case 8 * 1024:
2505                 default:
2506                         return _PAGE_SZ8K_4V;
2507                 case 64 * 1024:
2508                         return _PAGE_SZ64K_4V;
2509                 case 512 * 1024:
2510                         return _PAGE_SZ512K_4V;
2511                 case 4 * 1024 * 1024:
2512                         return _PAGE_SZ4MB_4V;
2513                 }
2514         } else {
2515                 switch (sz) {
2516                 case 8 * 1024:
2517                 default:
2518                         return _PAGE_SZ8K_4U;
2519                 case 64 * 1024:
2520                         return _PAGE_SZ64K_4U;
2521                 case 512 * 1024:
2522                         return _PAGE_SZ512K_4U;
2523                 case 4 * 1024 * 1024:
2524                         return _PAGE_SZ4MB_4U;
2525                 }
2526         }
2527 }
2528 
2529 pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size)
2530 {
2531         pte_t pte;
2532 
2533         pte_val(pte)  = page | pgprot_val(pgprot_noncached(prot));
2534         pte_val(pte) |= (((unsigned long)space) << 32);
2535         pte_val(pte) |= pte_sz_bits(page_size);
2536 
2537         return pte;
2538 }
2539 
2540 static unsigned long kern_large_tte(unsigned long paddr)
2541 {
2542         unsigned long val;
2543 
2544         val = (_PAGE_VALID | _PAGE_SZ4MB_4U |
2545                _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U |
2546                _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
2547         if (tlb_type == hypervisor)
2548                 val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
2549                        _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
2550                        _PAGE_EXEC_4V | _PAGE_W_4V);
2551 
2552         return val | paddr;
2553 }
2554 
2555 /* If not locked, zap it. */
2556 void __flush_tlb_all(void)
2557 {
2558         unsigned long pstate;
2559         int i;
2560 
2561         __asm__ __volatile__("flushw\n\t"
2562                              "rdpr      %%pstate, %0\n\t"
2563                              "wrpr      %0, %1, %%pstate"
2564                              : "=r" (pstate)
2565                              : "i" (PSTATE_IE));
2566         if (tlb_type == hypervisor) {
2567                 sun4v_mmu_demap_all();
2568         } else if (tlb_type == spitfire) {
2569                 for (i = 0; i < 64; i++) {
2570                         /* Spitfire Errata #32 workaround */
2571                         /* NOTE: Always runs on spitfire, so no
2572                          *       cheetah+ page size encodings.
2573                          */
2574                         __asm__ __volatile__("stxa      %0, [%1] %2\n\t"
2575                                              "flush     %%g6"
2576                                              : /* No outputs */
2577                                              : "r" (0),
2578                                              "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
2579 
2580                         if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) {
2581                                 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
2582                                                      "membar #Sync"
2583                                                      : /* no outputs */
2584                                                      : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
2585                                 spitfire_put_dtlb_data(i, 0x0UL);
2586                         }
2587 
2588                         /* Spitfire Errata #32 workaround */
2589                         /* NOTE: Always runs on spitfire, so no
2590                          *       cheetah+ page size encodings.
2591                          */
2592                         __asm__ __volatile__("stxa      %0, [%1] %2\n\t"
2593                                              "flush     %%g6"
2594                                              : /* No outputs */
2595                                              : "r" (0),
2596                                              "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
2597 
2598                         if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) {
2599                                 __asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
2600                                                      "membar #Sync"
2601                                                      : /* no outputs */
2602                                                      : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
2603                                 spitfire_put_itlb_data(i, 0x0UL);
2604                         }
2605                 }
2606         } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
2607                 cheetah_flush_dtlb_all();
2608                 cheetah_flush_itlb_all();
2609         }
2610         __asm__ __volatile__("wrpr      %0, 0, %%pstate"
2611                              : : "r" (pstate));
2612 }
2613 
2614 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
2615                             unsigned long address)
2616 {
2617         struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
2618                                        __GFP_REPEAT | __GFP_ZERO);
2619         pte_t *pte = NULL;
2620 
2621         if (page)
2622                 pte = (pte_t *) page_address(page);
2623 
2624         return pte;
2625 }
2626 
2627 pgtable_t pte_alloc_one(struct mm_struct *mm,
2628                         unsigned long address)
2629 {
2630         struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
2631                                        __GFP_REPEAT | __GFP_ZERO);
2632         pte_t *pte = NULL;
2633 
2634         if (page) {
2635                 pgtable_page_ctor(page);
2636                 pte = (pte_t *) page_address(page);
2637         }
2638 
2639         return pte;
2640 }
2641 
2642 void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
2643 {
2644         free_page((unsigned long)pte);
2645 }
2646 
2647 static void __pte_free(pgtable_t pte)
2648 {
2649         struct page *page = virt_to_page(pte);
2650 
2651         pgtable_page_dtor(page);
2652         __free_page(page);
2653 }
2654 
2655 void pte_free(struct mm_struct *mm, pgtable_t pte)
2656 {
2657         __pte_free(pte);
2658 }
2659 
2660 void pgtable_free(void *table, bool is_page)
2661 {
2662         if (is_page)
2663                 __pte_free(table);
2664         else
2665                 kmem_cache_free(pgtable_cache, table);
2666 }
2667 
2668 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2669 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
2670                           pmd_t *pmd)
2671 {
2672         unsigned long pte, flags;
2673         struct mm_struct *mm;
2674         pmd_t entry = *pmd;
2675 
2676         if (!pmd_large(entry) || !pmd_young(entry))
2677                 return;
2678 
2679         pte = pmd_val(entry);
2680 
2681         /* We are fabricating 8MB pages using 4MB real hw pages.  */
2682         pte |= (addr & (1UL << REAL_HPAGE_SHIFT));
2683 
2684         mm = vma->vm_mm;
2685 
2686         spin_lock_irqsave(&mm->context.lock, flags);
2687 
2688         if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL)
2689                 __update_mmu_tsb_insert(mm, MM_TSB_HUGE, REAL_HPAGE_SHIFT,
2690                                         addr, pte);
2691 
2692         spin_unlock_irqrestore(&mm->context.lock, flags);
2693 }
2694 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
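/* Editorial note, not part of the original init_64.c: as the comment
 * in update_mmu_cache_pmd() says, a transparent 8MB huge page is
 * built from two real 4MB hardware pages.  Assuming
 * REAL_HPAGE_SHIFT == 22 (4MB), bit 22 of the faulting address picks
 * the 4MB half that the TSB entry should describe, which is why it
 * is OR'd into the pte before __update_mmu_tsb_insert().
 */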
2695 
2696 #if defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)
2697 static void context_reload(void *__data)
2698 {
2699         struct mm_struct *mm = __data;
2700 
2701         if (mm == current->mm)
2702                 load_secondary_context(mm);
2703 }
2704 
2705 void hugetlb_setup(struct pt_regs *regs)
2706 {
2707         struct mm_struct *mm = current->mm;
2708         struct tsb_config *tp;
2709 
2710         if (in_atomic() || !mm) {
2711                 const struct exception_table_entry *entry;
2712 
2713                 entry = search_exception_tables(regs->tpc);
2714                 if (entry) {
2715                         regs->tpc = entry->fixup;
2716                         regs->tnpc = regs->tpc + 4;
2717                         return;
2718                 }
2719                 pr_alert("Unexpected HugeTLB setup in atomic context.\n");
2720                 die_if_kernel("HugeTSB in atomic", regs);
2721         }
2722 
2723         tp = &mm->context.tsb_block[MM_TSB_HUGE];
2724         if (likely(tp->tsb == NULL))
2725                 tsb_grow(mm, MM_TSB_HUGE, 0);
2726 
2727         tsb_context_switch(mm);
2728         smp_tsb_sync(mm);
2729 
2730         /* On UltraSPARC-III+ and later, configure the second half of
2731          * the Data-TLB for huge pages.
2732          */
2733         if (tlb_type == cheetah_plus) {
2734                 bool need_context_reload = false;
2735                 unsigned long ctx;
2736 
2737                 spin_lock_irq(&ctx_alloc_lock);
2738                 ctx = mm->context.sparc64_ctx_val;
2739                 ctx &= ~CTX_PGSZ_MASK;
2740                 ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
2741                 ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;
2742 
2743                 if (ctx != mm->context.sparc64_ctx_val) {
2744                         /* When changing the page size fields, we
2745                          * must perform a context flush so that no
2746                          * stale entries match.  This flush must
2747                          * occur with the original context register
2748                          * settings.
2749                          */
2750                         do_flush_tlb_mm(mm);
2751 
2752                         /* Reload the context register of all processors
2753                          * also executing in this address space.
2754                          */
2755                         mm->context.sparc64_ctx_val = ctx;
2756                         need_context_reload = true;
2757                 }
2758                 spin_unlock_irq(&ctx_alloc_lock);
2759 
2760                 if (need_context_reload)
2761                         on_each_cpu(context_reload, mm, 0);
2762         }
2763 }
2764 #endif
2765 
2766 #ifdef CONFIG_SMP
2767 #define do_flush_tlb_kernel_range       smp_flush_tlb_kernel_range
2768 #else
2769 #define do_flush_tlb_kernel_range       __flush_tlb_kernel_range
2770 #endif
2771 
2772 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
2773 {
2774         if (start < HI_OBP_ADDRESS && end > LOW_OBP_ADDRESS) {
2775                 if (start < LOW_OBP_ADDRESS) {
2776                         flush_tsb_kernel_range(start, LOW_OBP_ADDRESS);
2777                         do_flush_tlb_kernel_range(start, LOW_OBP_ADDRESS);
2778                 }
2779                 if (end > HI_OBP_ADDRESS) {
2780                         flush_tsb_kernel_range(HI_OBP_ADDRESS, end);
2781                         do_flush_tlb_kernel_range(HI_OBP_ADDRESS, end);
2782                 }
2783         } else {
2784                 flush_tsb_kernel_range(start, end);
2785                 do_flush_tlb_kernel_range(start, end);
2786         }
2787 }
2788 
