GNU Linux-libre 4.19.286-gnu1
arch/powerpc/mm/hugetlbpage.c
1 /*
2  * PPC Huge TLB Page Support for Kernel.
3  *
4  * Copyright (C) 2003 David Gibson, IBM Corporation.
5  * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
6  *
7  * Based on the IA-32 version:
8  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
9  */
10
11 #include <linux/mm.h>
12 #include <linux/io.h>
13 #include <linux/slab.h>
14 #include <linux/hugetlb.h>
15 #include <linux/export.h>
16 #include <linux/of_fdt.h>
17 #include <linux/memblock.h>
18 #include <linux/bootmem.h>
19 #include <linux/moduleparam.h>
20 #include <linux/swap.h>
21 #include <linux/swapops.h>
22 #include <linux/kmemleak.h>
23 #include <asm/pgtable.h>
24 #include <asm/pgalloc.h>
25 #include <asm/tlb.h>
26 #include <asm/setup.h>
27 #include <asm/hugetlb.h>
28 #include <asm/pte-walk.h>
29
30
31 #ifdef CONFIG_HUGETLB_PAGE
32
33 #define PAGE_SHIFT_64K  16
34 #define PAGE_SHIFT_512K 19
35 #define PAGE_SHIFT_8M   23
36 #define PAGE_SHIFT_16M  24
37 #define PAGE_SHIFT_16G  34
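/*
 * Editor's note (not in the original file): these shifts are just log2 of the
 * supported huge page sizes, e.g. 1UL << PAGE_SHIFT_512K == 0x80000 (512 KiB),
 * 1UL << PAGE_SHIFT_16M == 0x1000000 (16 MiB) and
 * 1UL << PAGE_SHIFT_16G == 0x400000000 (16 GiB).
 */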
38
39 bool hugetlb_disabled = false;
40
41 unsigned int HPAGE_SHIFT;
42 EXPORT_SYMBOL(HPAGE_SHIFT);
43
44 #define hugepd_none(hpd)        (hpd_val(hpd) == 0)
45
46 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
47 {
48         /*
49          * Only called for hugetlbfs pages, hence we can ignore THP and the
50          * irq-disabled walk.
51          */
52         return __find_linux_pte(mm->pgd, addr, NULL, NULL);
53 }
54
55 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
56                            unsigned long address, unsigned int pdshift,
57                            unsigned int pshift, spinlock_t *ptl)
58 {
59         struct kmem_cache *cachep;
60         pte_t *new;
61         int i;
62         int num_hugepd;
63
64         if (pshift >= pdshift) {
65                 cachep = hugepte_cache;
66                 num_hugepd = 1 << (pshift - pdshift);
67         } else {
68                 cachep = PGT_CACHE(pdshift - pshift);
69                 num_hugepd = 1;
70         }
71
72         new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
73
74         BUG_ON(pshift > HUGEPD_SHIFT_MASK);
75         BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
76
77         if (!new)
78                 return -ENOMEM;
79
80         /*
81          * Make sure other cpus find the hugepd set only after a
82          * properly initialized page table is visible to them.
83          * For more details, see the comment in __pte_alloc().
84          */
85         smp_wmb();
86
87         spin_lock(ptl);
88         /*
89          * We have multiple higher-level entries that point to the same
90          * actual pte location.  Fill in each as we go and backtrack on error.
91          * We need all of these so the DTLB pgtable walk code can find the
92          * right higher-level entry without knowing if it's a hugepage or not.
93          */
94         for (i = 0; i < num_hugepd; i++, hpdp++) {
95                 if (unlikely(!hugepd_none(*hpdp)))
96                         break;
97                 else {
98 #ifdef CONFIG_PPC_BOOK3S_64
99                         *hpdp = __hugepd(__pa(new) |
100                                          (shift_to_mmu_psize(pshift) << 2));
101 #elif defined(CONFIG_PPC_8xx)
102                         *hpdp = __hugepd(__pa(new) | _PMD_USER |
103                                          (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
104                                           _PMD_PAGE_512K) | _PMD_PRESENT);
105 #else
106                         /* We use the old format for PPC_FSL_BOOK3E */
107                         *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
108 #endif
109                 }
110         }
111         /* If we bailed from the for loop early, an error occurred; clean up */
112         if (i < num_hugepd) {
113                 for (i = i - 1 ; i >= 0; i--, hpdp--)
114                         *hpdp = __hugepd(0);
115                 kmem_cache_free(cachep, new);
116         } else {
117                 kmemleak_ignore(new);
118         }
119         spin_unlock(ptl);
120         return 0;
121 }
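/*
 * Editor's illustration (not in the original file): the num_hugepd arithmetic
 * above.  When pshift >= pdshift the huge page covers one or more full slots
 * at this directory level, so 1 << (pshift - pdshift) consecutive entries are
 * all pointed at the same hugepte table (e.g. pshift = pdshift + 1 gives
 * num_hugepd = 2).  When pshift < pdshift a single entry points to a table
 * allocated from PGT_CACHE(pdshift - pshift).
 */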
122
123 /*
124  * At this point we do the placement change only for BOOK3S 64. This would
125  * possibly work on other subarchs.
126  */
127 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
128 {
129         pgd_t *pg;
130         pud_t *pu;
131         pmd_t *pm;
132         hugepd_t *hpdp = NULL;
133         unsigned pshift = __ffs(sz);
134         unsigned pdshift = PGDIR_SHIFT;
135         spinlock_t *ptl;
136
137         addr &= ~(sz-1);
138         pg = pgd_offset(mm, addr);
139
140 #ifdef CONFIG_PPC_BOOK3S_64
141         if (pshift == PGDIR_SHIFT)
142                 /* 16GB huge page */
143                 return (pte_t *) pg;
144         else if (pshift > PUD_SHIFT) {
145                 /*
146                  * We need to use hugepd table
147                  */
148                 ptl = &mm->page_table_lock;
149                 hpdp = (hugepd_t *)pg;
150         } else {
151                 pdshift = PUD_SHIFT;
152                 pu = pud_alloc(mm, pg, addr);
153                 if (!pu)
154                         return NULL;
155                 if (pshift == PUD_SHIFT)
156                         return (pte_t *)pu;
157                 else if (pshift > PMD_SHIFT) {
158                         ptl = pud_lockptr(mm, pu);
159                         hpdp = (hugepd_t *)pu;
160                 } else {
161                         pdshift = PMD_SHIFT;
162                         pm = pmd_alloc(mm, pu, addr);
163                         if (!pm)
164                                 return NULL;
165                         if (pshift == PMD_SHIFT)
166                                 /* 16MB hugepage */
167                                 return (pte_t *)pm;
168                         else {
169                                 ptl = pmd_lockptr(mm, pm);
170                                 hpdp = (hugepd_t *)pm;
171                         }
172                 }
173         }
174 #else
175         if (pshift >= PGDIR_SHIFT) {
176                 ptl = &mm->page_table_lock;
177                 hpdp = (hugepd_t *)pg;
178         } else {
179                 pdshift = PUD_SHIFT;
180                 pu = pud_alloc(mm, pg, addr);
181                 if (!pu)
182                         return NULL;
183                 if (pshift >= PUD_SHIFT) {
184                         ptl = pud_lockptr(mm, pu);
185                         hpdp = (hugepd_t *)pu;
186                 } else {
187                         pdshift = PMD_SHIFT;
188                         pm = pmd_alloc(mm, pu, addr);
189                         if (!pm)
190                                 return NULL;
191                         ptl = pmd_lockptr(mm, pm);
192                         hpdp = (hugepd_t *)pm;
193                 }
194         }
195 #endif
196         if (!hpdp)
197                 return NULL;
198
199         BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
200
201         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
202                                                   pdshift, pshift, ptl))
203                 return NULL;
204
205         return hugepte_offset(*hpdp, addr, pdshift);
206 }
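/*
 * Editor's summary of the placement logic above (not in the original file),
 * for the CONFIG_PPC_BOOK3S_64 case:
 *   pshift == PGDIR_SHIFT             -> the PGD entry itself is the huge PTE
 *   PUD_SHIFT < pshift < PGDIR_SHIFT  -> hugepd table hung off the PGD entry
 *   pshift == PUD_SHIFT               -> the PUD entry itself is the huge PTE
 *   PMD_SHIFT < pshift < PUD_SHIFT    -> hugepd table hung off the PUD entry
 *   pshift == PMD_SHIFT               -> the PMD entry itself is the huge PTE
 *   pshift < PMD_SHIFT                -> hugepd table hung off the PMD entry
 * The non-BOOK3S branch always installs a hugepd table, at the PGD, PUD or
 * PMD level depending on the page size.
 */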
207
208 #ifdef CONFIG_PPC_BOOK3S_64
209 /*
210  * Tracks gpages after the device tree is scanned and before the
211  * huge_boot_pages list is ready on pseries.
212  */
213 #define MAX_NUMBER_GPAGES       1024
214 __initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
215 __initdata static unsigned nr_gpages;
216
217 /*
218  * Build list of addresses of gigantic pages.  This function is used in early
219  * boot before the buddy allocator is setup.
220  */
221 void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
222 {
223         if (!addr)
224                 return;
225         while (number_of_pages > 0) {
226                 gpage_freearray[nr_gpages] = addr;
227                 nr_gpages++;
228                 number_of_pages--;
229                 addr += page_size;
230         }
231 }
232
233 int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
234 {
235         struct huge_bootmem_page *m;
236         if (nr_gpages == 0)
237                 return 0;
238         m = phys_to_virt(gpage_freearray[--nr_gpages]);
239         gpage_freearray[nr_gpages] = 0;
240         list_add(&m->list, &huge_boot_pages);
241         m->hstate = hstate;
242         return 1;
243 }
244 #endif
245
246
247 int __init alloc_bootmem_huge_page(struct hstate *h)
248 {
249
250 #ifdef CONFIG_PPC_BOOK3S_64
251         if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
252                 return pseries_alloc_bootmem_huge_page(h);
253 #endif
254         return __alloc_bootmem_huge_page(h);
255 }
256
257 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
258 #define HUGEPD_FREELIST_SIZE \
259         ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
260
261 struct hugepd_freelist {
262         struct rcu_head rcu;
263         unsigned int index;
264         void *ptes[0];
265 };
266
267 static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
268
269 static void hugepd_free_rcu_callback(struct rcu_head *head)
270 {
271         struct hugepd_freelist *batch =
272                 container_of(head, struct hugepd_freelist, rcu);
273         unsigned int i;
274
275         for (i = 0; i < batch->index; i++)
276                 kmem_cache_free(hugepte_cache, batch->ptes[i]);
277
278         free_page((unsigned long)batch);
279 }
280
281 static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
282 {
283         struct hugepd_freelist **batchp;
284
285         batchp = &get_cpu_var(hugepd_freelist_cur);
286
287         if (atomic_read(&tlb->mm->mm_users) < 2 ||
288             mm_is_thread_local(tlb->mm)) {
289                 kmem_cache_free(hugepte_cache, hugepte);
290                 put_cpu_var(hugepd_freelist_cur);
291                 return;
292         }
293
294         if (*batchp == NULL) {
295                 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
296                 (*batchp)->index = 0;
297         }
298
299         (*batchp)->ptes[(*batchp)->index++] = hugepte;
300         if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
301                 call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
302                 *batchp = NULL;
303         }
304         put_cpu_var(hugepd_freelist_cur);
305 }
306 #else
307 static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
308 #endif
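/*
 * Editor's note (not in the original file): on the FSL/8xx side above,
 * hugepd_free() frees a hugepte table immediately when the mm is only in use
 * by the local thread; otherwise it stashes the pointer in a per-cpu batch
 * page and, once HUGEPD_FREELIST_SIZE entries have accumulated, frees the
 * whole batch from an RCU callback.  Deferring to a grace period is
 * presumably what keeps concurrent lockless walkers (see the IRQ-disabled
 * walk comments later in this file) from dereferencing a freed table.
 */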
309
310 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
311                               unsigned long start, unsigned long end,
312                               unsigned long floor, unsigned long ceiling)
313 {
314         pte_t *hugepte = hugepd_page(*hpdp);
315         int i;
316
317         unsigned long pdmask = ~((1UL << pdshift) - 1);
318         unsigned int num_hugepd = 1;
319         unsigned int shift = hugepd_shift(*hpdp);
320
321         /* Note: On fsl the hpdp may be the first of several */
322         if (shift > pdshift)
323                 num_hugepd = 1 << (shift - pdshift);
324
325         start &= pdmask;
326         if (start < floor)
327                 return;
328         if (ceiling) {
329                 ceiling &= pdmask;
330                 if (!ceiling)
331                         return;
332         }
333         if (end - 1 > ceiling - 1)
334                 return;
335
336         for (i = 0; i < num_hugepd; i++, hpdp++)
337                 *hpdp = __hugepd(0);
338
339         if (shift >= pdshift)
340                 hugepd_free(tlb, hugepte);
341         else
342                 pgtable_free_tlb(tlb, hugepte,
343                                  get_hugepd_cache_index(pdshift - shift));
344 }
345
346 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
347                                    unsigned long addr, unsigned long end,
348                                    unsigned long floor, unsigned long ceiling)
349 {
350         pmd_t *pmd;
351         unsigned long next;
352         unsigned long start;
353
354         start = addr;
355         do {
356                 unsigned long more;
357
358                 pmd = pmd_offset(pud, addr);
359                 next = pmd_addr_end(addr, end);
360                 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
361                         /*
362                          * If it is not a hugepd pointer, we should already find
363                          * it cleared.
364                          */
365                         WARN_ON(!pmd_none_or_clear_bad(pmd));
366                         continue;
367                 }
368                 /*
369                  * Increment next by the size of the huge mapping since
370                  * there may be more than one entry at this level for a
371                  * single hugepage, but all of them point to
372                  * the same kmem cache that holds the hugepte.
373                  */
374                 more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
375                 if (more > next)
376                         next = more;
377
378                 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
379                                   addr, next, floor, ceiling);
380         } while (addr = next, addr != end);
381
382         start &= PUD_MASK;
383         if (start < floor)
384                 return;
385         if (ceiling) {
386                 ceiling &= PUD_MASK;
387                 if (!ceiling)
388                         return;
389         }
390         if (end - 1 > ceiling - 1)
391                 return;
392
393         pmd = pmd_offset(pud, start);
394         pud_clear(pud);
395         pmd_free_tlb(tlb, pmd, start);
396         mm_dec_nr_pmds(tlb->mm);
397 }
398
399 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
400                                    unsigned long addr, unsigned long end,
401                                    unsigned long floor, unsigned long ceiling)
402 {
403         pud_t *pud;
404         unsigned long next;
405         unsigned long start;
406
407         start = addr;
408         do {
409                 pud = pud_offset(pgd, addr);
410                 next = pud_addr_end(addr, end);
411                 if (!is_hugepd(__hugepd(pud_val(*pud)))) {
412                         if (pud_none_or_clear_bad(pud))
413                                 continue;
414                         hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
415                                                ceiling);
416                 } else {
417                         unsigned long more;
418                         /*
419                          * Increment next by the size of the huge mapping since
420                          * there may be more than one entry at this level for a
421                          * single hugepage, but all of them point to
422                          * the same kmem cache that holds the hugepte.
423                          */
424                         more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
425                         if (more > next)
426                                 next = more;
427
428                         free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
429                                           addr, next, floor, ceiling);
430                 }
431         } while (addr = next, addr != end);
432
433         start &= PGDIR_MASK;
434         if (start < floor)
435                 return;
436         if (ceiling) {
437                 ceiling &= PGDIR_MASK;
438                 if (!ceiling)
439                         return;
440         }
441         if (end - 1 > ceiling - 1)
442                 return;
443
444         pud = pud_offset(pgd, start);
445         pgd_clear(pgd);
446         pud_free_tlb(tlb, pud, start);
447         mm_dec_nr_puds(tlb->mm);
448 }
449
450 /*
451  * This function frees user-level page tables of a process.
452  */
453 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
454                             unsigned long addr, unsigned long end,
455                             unsigned long floor, unsigned long ceiling)
456 {
457         pgd_t *pgd;
458         unsigned long next;
459
460         /*
461          * Because there are a number of different possible pagetable
462          * layouts for hugepage ranges, we limit knowledge of how
463          * things should be laid out to the allocation path
464          * (huge_pte_alloc(), above).  Everything else works out the
465          * structure as it goes from information in the hugepd
466          * pointers.  That means that here we can't use the
467          * optimization used in the normal page free_pgd_range(), of
468          * checking whether we're actually covering a large enough
469          * range to have to do anything at the top level of the walk
470          * instead of at the bottom.
471          *
472          * To make sense of this, you should probably go read the big
473          * block comment at the top of the normal free_pgd_range(),
474          * too.
475          */
476
477         do {
478                 next = pgd_addr_end(addr, end);
479                 pgd = pgd_offset(tlb->mm, addr);
480                 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
481                         if (pgd_none_or_clear_bad(pgd))
482                                 continue;
483                         hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
484                 } else {
485                         unsigned long more;
486                         /*
487                          * Increment next by the size of the huge mapping since
488                          * there may be more than one entry at the pgd level
489                          * for a single hugepage, but all of them point to the
490                          * same kmem cache that holds the hugepte.
491                          */
492                         more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
493                         if (more > next)
494                                 next = more;
495
496                         free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
497                                           addr, next, floor, ceiling);
498                 }
499         } while (addr = next, addr != end);
500 }
501
502 struct page *follow_huge_pd(struct vm_area_struct *vma,
503                             unsigned long address, hugepd_t hpd,
504                             int flags, int pdshift)
505 {
506         pte_t *ptep;
507         spinlock_t *ptl;
508         struct page *page = NULL;
509         unsigned long mask;
510         int shift = hugepd_shift(hpd);
511         struct mm_struct *mm = vma->vm_mm;
512
513 retry:
514         /*
515          * hugepage directory entries are protected by mm->page_table_lock
516          * Use this instead of huge_pte_lockptr
517          */
518         ptl = &mm->page_table_lock;
519         spin_lock(ptl);
520
521         ptep = hugepte_offset(hpd, address, pdshift);
522         if (pte_present(*ptep)) {
523                 mask = (1UL << shift) - 1;
524                 page = pte_page(*ptep);
525                 page += ((address & mask) >> PAGE_SHIFT);
526                 if (flags & FOLL_GET)
527                         get_page(page);
528         } else {
529                 if (is_hugetlb_entry_migration(*ptep)) {
530                         spin_unlock(ptl);
531                         __migration_entry_wait(mm, ptep, ptl);
532                         goto retry;
533                 }
534         }
535         spin_unlock(ptl);
536         return page;
537 }
538
539 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
540                                       unsigned long sz)
541 {
542         unsigned long __boundary = (addr + sz) & ~(sz-1);
543         return (__boundary - 1 < end - 1) ? __boundary : end;
544 }
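/*
 * Editor's worked example (not in the original file): with sz = 16M
 * (0x1000000) and addr = 0x10123000, __boundary = (0x10123000 + 0x1000000)
 * & ~0xffffff = 0x11000000, so the walk advances to the next 16M boundary
 * unless 'end' comes first, in which case 'end' is returned.
 */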
545
546 int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
547                 unsigned long end, int write, struct page **pages, int *nr)
548 {
549         pte_t *ptep;
550         unsigned long sz = 1UL << hugepd_shift(hugepd);
551         unsigned long next;
552
553         ptep = hugepte_offset(hugepd, addr, pdshift);
554         do {
555                 next = hugepte_addr_end(addr, end, sz);
556                 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
557                         return 0;
558         } while (ptep++, addr = next, addr != end);
559
560         return 1;
561 }
562
563 #ifdef CONFIG_PPC_MM_SLICES
564 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
565                                         unsigned long len, unsigned long pgoff,
566                                         unsigned long flags)
567 {
568         struct hstate *hstate = hstate_file(file);
569         int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
570
571 #ifdef CONFIG_PPC_RADIX_MMU
572         if (radix_enabled())
573                 return radix__hugetlb_get_unmapped_area(file, addr, len,
574                                                        pgoff, flags);
575 #endif
576         return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
577 }
578 #endif
579
580 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
581 {
582 #ifdef CONFIG_PPC_MM_SLICES
583         /* With radix we don't use slices, so derive it from the vma */
584         if (!radix_enabled()) {
585                 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
586
587                 return 1UL << mmu_psize_to_shift(psize);
588         }
589 #endif
590         return vma_kernel_pagesize(vma);
591 }
592
593 static inline bool is_power_of_4(unsigned long x)
594 {
595         if (is_power_of_2(x))
596                 return (__ilog2(x) % 2) ? false : true;
597         return false;
598 }
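/*
 * Editor's worked example (not in the original file): 4M = 2^22 and
 * 22 % 2 == 0, so 4M is a power of 4 (4^11); 8M = 2^23 has an odd exponent
 * and is rejected.  This matches the FSL_BOOK3E restriction applied in
 * add_huge_page_size() below.
 */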
599
600 static int __init add_huge_page_size(unsigned long long size)
601 {
602         int shift = __ffs(size);
603         int mmu_psize;
604
605         /* Check that it is a page size supported by the hardware and
606          * that it fits within pagetable and slice limits. */
607         if (size <= PAGE_SIZE)
608                 return -EINVAL;
609 #if defined(CONFIG_PPC_FSL_BOOK3E)
610         if (!is_power_of_4(size))
611                 return -EINVAL;
612 #elif !defined(CONFIG_PPC_8xx)
613         if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
614                 return -EINVAL;
615 #endif
616
617         if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
618                 return -EINVAL;
619
620 #ifdef CONFIG_PPC_BOOK3S_64
621         /*
622          * We need to make sure that for different page sizes reported by
623          * firmware we only add hugetlb support for page sizes that can be
624          * supported by linux page table layout.
625          * For now we have
626          * Radix: 2M and 1G
627          * Hash: 16M and 16G
628          */
629         if (radix_enabled()) {
630                 if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
631                         return -EINVAL;
632         } else {
633                 if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
634                         return -EINVAL;
635         }
636 #endif
637
638         BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
639
640         /* Return if this huge page size has already been set up */
641         if (size_to_hstate(size))
642                 return 0;
643
644         hugetlb_add_hstate(shift - PAGE_SHIFT);
645
646         return 0;
647 }
648
649 static int __init hugepage_setup_sz(char *str)
650 {
651         unsigned long long size;
652
653         size = memparse(str, &str);
654
655         if (add_huge_page_size(size) != 0) {
656                 hugetlb_bad_size();
657                 pr_err("Invalid huge page size specified (%llu)\n", size);
658         }
659
660         return 1;
661 }
662 __setup("hugepagesz=", hugepage_setup_sz);
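/*
 * Editor's usage note (not in the original file): this handler parses the
 * "hugepagesz=" boot parameter, e.g. booting with
 *     hugepagesz=16M hugepages=8
 * makes memparse() return 16777216, add_huge_page_size() registers the 16M
 * hstate, and the generic hugetlb code then reserves 8 such pages.
 */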
663
664 struct kmem_cache *hugepte_cache;
665 static int __init hugetlbpage_init(void)
666 {
667         int psize;
668
669         if (hugetlb_disabled) {
670                 pr_info("HugeTLB support is disabled!\n");
671                 return 0;
672         }
673
674 #if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
675         if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
676                 return -ENODEV;
677 #endif
678         for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
679                 unsigned shift;
680                 unsigned pdshift;
681
682                 if (!mmu_psize_defs[psize].shift)
683                         continue;
684
685                 shift = mmu_psize_to_shift(psize);
686
687 #ifdef CONFIG_PPC_BOOK3S_64
688                 if (shift > PGDIR_SHIFT)
689                         continue;
690                 else if (shift > PUD_SHIFT)
691                         pdshift = PGDIR_SHIFT;
692                 else if (shift > PMD_SHIFT)
693                         pdshift = PUD_SHIFT;
694                 else
695                         pdshift = PMD_SHIFT;
696 #else
697                 if (shift < PUD_SHIFT)
698                         pdshift = PMD_SHIFT;
699                 else if (shift < PGDIR_SHIFT)
700                         pdshift = PUD_SHIFT;
701                 else
702                         pdshift = PGDIR_SHIFT;
703 #endif
704
705                 if (add_huge_page_size(1ULL << shift) < 0)
706                         continue;
707                 /*
708                  * If pdshift and shift are the same, we don't use
709                  * the pgt cache for hugepd.
710                  */
711                 if (pdshift > shift)
712                         pgtable_cache_add(pdshift - shift, NULL);
713 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
714                 else if (!hugepte_cache) {
715                         /*
716                          * Create a kmem cache for hugeptes.  The bottom bits of
717                          * the hugepd entry encode size information, so align the
718                          * hugepte tables to allow for this.
719                          */
720                         hugepte_cache = kmem_cache_create("hugepte-cache",
721                                                           sizeof(pte_t),
722                                                           HUGEPD_SHIFT_MASK + 1,
723                                                           0, NULL);
724                         if (hugepte_cache == NULL)
725                                 panic("%s: Unable to create kmem cache "
726                                       "for hugeptes\n", __func__);
727
728                 }
729 #endif
730         }
731
732 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
733         /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
734         if (mmu_psize_defs[MMU_PAGE_4M].shift)
735                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
736         else if (mmu_psize_defs[MMU_PAGE_512K].shift)
737                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
738 #else
739         /* Set default large page size. Currently, we pick 16M, 1M or 2M
740          * depending on what is available
741          */
742         if (mmu_psize_defs[MMU_PAGE_16M].shift)
743                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
744         else if (mmu_psize_defs[MMU_PAGE_1M].shift)
745                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
746         else if (mmu_psize_defs[MMU_PAGE_2M].shift)
747                 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
748 #endif
749         return 0;
750 }
751
752 arch_initcall(hugetlbpage_init);
753
754 void flush_dcache_icache_hugepage(struct page *page)
755 {
756         int i;
757         void *start;
758
759         BUG_ON(!PageCompound(page));
760
761         for (i = 0; i < (1UL << compound_order(page)); i++) {
762                 if (!PageHighMem(page)) {
763                         __flush_dcache_icache(page_address(page+i));
764                 } else {
765                         start = kmap_atomic(page+i);
766                         __flush_dcache_icache(start);
767                         kunmap_atomic(start);
768                 }
769         }
770 }
771
772 #endif /* CONFIG_HUGETLB_PAGE */
773
774 /*
775  * We have 4 cases for pgds and pmds:
776  * (1) invalid (all zeroes)
777  * (2) pointer to next table, as normal; bottom 6 bits == 0
778  * (3) leaf pte for huge page _PAGE_PTE set
779  * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
780  *
781  * So long as we atomically load page table pointers we are safe against teardown;
782  * we can follow the address down to the page and take a ref on it.
783  * This function needs to be called with interrupts disabled. We use this variant
784  * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED.
785  */
786 pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
787                         bool *is_thp, unsigned *hpage_shift)
788 {
789         pgd_t pgd, *pgdp;
790         pud_t pud, *pudp;
791         pmd_t pmd, *pmdp;
792         pte_t *ret_pte;
793         hugepd_t *hpdp = NULL;
794         unsigned pdshift = PGDIR_SHIFT;
795
796         if (hpage_shift)
797                 *hpage_shift = 0;
798
799         if (is_thp)
800                 *is_thp = false;
801
802         pgdp = pgdir + pgd_index(ea);
803         pgd  = READ_ONCE(*pgdp);
804         /*
805          * Always operate on the local stack value. This makes sure the
806          * value doesn't get updated by a parallel THP split/collapse,
807          * page fault or page unmap. The returned pte_t * is still not
808          * stable, so the caller should check it for the above conditions.
809          */
810         if (pgd_none(pgd))
811                 return NULL;
812         else if (pgd_huge(pgd)) {
813                 ret_pte = (pte_t *) pgdp;
814                 goto out;
815         } else if (is_hugepd(__hugepd(pgd_val(pgd))))
816                 hpdp = (hugepd_t *)&pgd;
817         else {
818                 /*
819                  * Even if we end up with an unmap, the pgtable will not
820                  * be freed, because we do an rcu free and here we are
821                  * irq disabled
822                  */
823                 pdshift = PUD_SHIFT;
824                 pudp = pud_offset(&pgd, ea);
825                 pud  = READ_ONCE(*pudp);
826
827                 if (pud_none(pud))
828                         return NULL;
829                 else if (pud_huge(pud)) {
830                         ret_pte = (pte_t *) pudp;
831                         goto out;
832                 } else if (is_hugepd(__hugepd(pud_val(pud))))
833                         hpdp = (hugepd_t *)&pud;
834                 else {
835                         pdshift = PMD_SHIFT;
836                         pmdp = pmd_offset(&pud, ea);
837                         pmd  = READ_ONCE(*pmdp);
838                         /*
839                          * A hugepage collapse is captured by pmd_none, because
840                          * it marks the pmd none and does an hpte invalidate.
841                          */
842                         if (pmd_none(pmd))
843                                 return NULL;
844
845                         if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
846                                 if (is_thp)
847                                         *is_thp = true;
848                                 ret_pte = (pte_t *) pmdp;
849                                 goto out;
850                         }
851
852                         if (pmd_huge(pmd)) {
853                                 ret_pte = (pte_t *) pmdp;
854                                 goto out;
855                         } else if (is_hugepd(__hugepd(pmd_val(pmd))))
856                                 hpdp = (hugepd_t *)&pmd;
857                         else
858                                 return pte_offset_kernel(&pmd, ea);
859                 }
860         }
861         if (!hpdp)
862                 return NULL;
863
864         ret_pte = hugepte_offset(*hpdp, ea, pdshift);
865         pdshift = hugepd_shift(*hpdp);
866 out:
867         if (hpage_shift)
868                 *hpage_shift = pdshift;
869         return ret_pte;
870 }
871 EXPORT_SYMBOL_GPL(__find_linux_pte);
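/*
 * Editor's illustration (not part of the original file): a minimal sketch of
 * the calling convention for __find_linux_pte().  The function name and the
 * way the result is used are hypothetical; the point is that the walk must
 * run with interrupts disabled and that the returned pte must be re-checked,
 * as the comments above require.
 */
static unsigned int __maybe_unused example_hpage_shift(struct mm_struct *mm,
							unsigned long ea)
{
	unsigned long flags;
	unsigned int shift = 0;
	pte_t *ptep;

	local_irq_save(flags);	/* keep page tables from being freed under us */
	ptep = __find_linux_pte(mm->pgd, ea, NULL, &shift);
	if (!ptep || !pte_present(READ_ONCE(*ptep)))
		shift = 0;
	local_irq_restore(flags);

	/* 0 means unmapped or a normal pte; otherwise the shift of the huge mapping */
	return shift;
}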
872
873 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
874                 unsigned long end, int write, struct page **pages, int *nr)
875 {
876         unsigned long pte_end;
877         struct page *head, *page;
878         pte_t pte;
879         int refs;
880
881         pte_end = (addr + sz) & ~(sz-1);
882         if (pte_end < end)
883                 end = pte_end;
884
885         pte = READ_ONCE(*ptep);
886
887         if (!pte_access_permitted(pte, write))
888                 return 0;
889
890         /* hugepages are never "special" */
891         VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
892
893         refs = 0;
894         head = pte_page(pte);
895
896         page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
897         do {
898                 VM_BUG_ON(compound_head(page) != head);
899                 pages[*nr] = page;
900                 (*nr)++;
901                 page++;
902                 refs++;
903         } while (addr += PAGE_SIZE, addr != end);
904
905         if (!page_cache_add_speculative(head, refs)) {
906                 *nr -= refs;
907                 return 0;
908         }
909
910         if (unlikely(pte_val(pte) != pte_val(*ptep))) {
911                 /* Could be optimized better */
912                 *nr -= refs;
913                 while (refs--)
914                         put_page(head);
915                 return 0;
916         }
917
918         return 1;
919 }
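/*
 * Editor's worked example (not in the original file): for a 16M huge page
 * mapped with 4K base pages, a gup_hugepte() call covering one full huge page
 * visits 16M / 4K = 4096 page positions, records them in pages[], and then
 * takes all 4096 references on the head page with a single
 * page_cache_add_speculative() call, backing them out if the pte changed
 * underneath.
 */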