arch/x86/mm/dump_pagetables.c (GNU Linux-libre 4.14.266-gnu1)
/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>

#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one; for
 * that it needs to keep some state while walking, and to flush this
 * state when a "break" in the continuity is found.
 */
struct pg_state {
        int level;
        pgprot_t current_prot;
        unsigned long start_address;
        unsigned long current_address;
        const struct addr_marker *marker;
        unsigned long lines;
        bool to_dmesg;
        bool check_wx;
        unsigned long wx_pages;
};

struct addr_marker {
        unsigned long start_address;
        const char *name;
        /* If non-zero, lines beyond this count are summarized as skipped. */
        unsigned long max_lines;
};

/* Markers for interesting parts of the address space */

#ifdef CONFIG_X86_64

enum address_markers_idx {
        USER_SPACE_NR = 0,
        KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
        LDT_NR,
#endif
        LOW_KERNEL_NR,
        VMALLOC_START_NR,
        VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
        KASAN_SHADOW_START_NR,
        KASAN_SHADOW_END_NR,
#endif
        CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
        ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
        EFI_END_NR,
#endif
        HIGH_KERNEL_NR,
        MODULES_VADDR_NR,
        MODULES_END_NR,
        FIXADDR_START_NR,
        END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
        [USER_SPACE_NR]         = { 0,                  "User Space" },
        [KERNEL_SPACE_NR]       = { (1UL << 63),        "Kernel Space" },
        [LOW_KERNEL_NR]         = { 0UL,                "Low Kernel Mapping" },
        [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
        [VMEMMAP_START_NR]      = { 0UL,                "Vmemmap" },
#ifdef CONFIG_KASAN
        [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
        [KASAN_SHADOW_END_NR]   = { KASAN_SHADOW_END,   "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
        [LDT_NR]                = { LDT_BASE_ADDR,      "LDT remap" },
#endif
        [CPU_ENTRY_AREA_NR]     = { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
        [ESPFIX_START_NR]       = { ESPFIX_BASE_ADDR,   "ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
        [EFI_END_NR]            = { EFI_VA_END,         "EFI Runtime Services" },
#endif
        [HIGH_KERNEL_NR]        = { __START_KERNEL_map, "High Kernel Mapping" },
        [MODULES_VADDR_NR]      = { MODULES_VADDR,      "Modules" },
        [MODULES_END_NR]        = { MODULES_END,        "End Modules" },
        [FIXADDR_START_NR]      = { FIXADDR_START,      "Fixmap Area" },
        [END_OF_SPACE_NR]       = { -1,                 NULL }
};
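
/*
 * Note: several start addresses above are 0UL placeholders; they are
 * not compile-time constants and are filled in by pt_dump_init().
 */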

#else /* CONFIG_X86_64 */

enum address_markers_idx {
        USER_SPACE_NR = 0,
        KERNEL_SPACE_NR,
        VMALLOC_START_NR,
        VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
        PKMAP_BASE_NR,
#endif
        CPU_ENTRY_AREA_NR,
        FIXADDR_START_NR,
        END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
        [USER_SPACE_NR]         = { 0,                  "User Space" },
        [KERNEL_SPACE_NR]       = { PAGE_OFFSET,        "Kernel Mapping" },
        [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
        [VMALLOC_END_NR]        = { 0UL,                "vmalloc() End" },
#ifdef CONFIG_HIGHMEM
        [PKMAP_BASE_NR]         = { 0UL,                "Persistent kmap() Area" },
#endif
        [CPU_ENTRY_AREA_NR]     = { 0UL,                "CPU entry area" },
        [FIXADDR_START_NR]      = { 0UL,                "Fixmap area" },
        [END_OF_SPACE_NR]       = { -1,                 NULL }
};

#endif /* !CONFIG_X86_64 */

/* Bytes of address space covered by a single entry at each level */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
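
/*
 * For reference (x86-64 with 4 KiB pages; illustrative):
 *   PTE_LEVEL_MULT = 4 KiB
 *   PMD_LEVEL_MULT = 512 * 4 KiB = 2 MiB
 *   PUD_LEVEL_MULT = 512 * 2 MiB = 1 GiB
 *   P4D_LEVEL_MULT = 512 * 1 GiB = 512 GiB
 *   PGD_LEVEL_MULT = 512 GiB with 4-level paging (PTRS_PER_P4D == 1),
 *                    256 TiB with 5-level paging (PTRS_PER_P4D == 512)
 */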

/*
 * Output goes either to the kernel log (the boot-time W+X check) or to
 * the seq_file backing the debugfs interface; 'm' may be NULL in the
 * former case.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)           \
({                                                              \
        if (to_dmesg)                                           \
                printk(KERN_INFO fmt, ##args);                  \
        else                                                    \
                if (m)                                          \
                        seq_printf(m, fmt, ##args);             \
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)          \
({                                                              \
        if (to_dmesg)                                           \
                printk(KERN_CONT fmt, ##args);                  \
        else                                                    \
                if (m)                                          \
                        seq_printf(m, fmt, ##args);             \
})

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
        pgprotval_t pr = pgprot_val(prot);
        static const char * const level_name[] =
                { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

        if (!(pr & _PAGE_PRESENT)) {
                /* Not present */
                pt_dump_cont_printf(m, dmsg, "                              ");
        } else {
                if (pr & _PAGE_USER)
                        pt_dump_cont_printf(m, dmsg, "USR ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
                if (pr & _PAGE_RW)
                        pt_dump_cont_printf(m, dmsg, "RW ");
                else
                        pt_dump_cont_printf(m, dmsg, "ro ");
                if (pr & _PAGE_PWT)
                        pt_dump_cont_printf(m, dmsg, "PWT ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
                if (pr & _PAGE_PCD)
                        pt_dump_cont_printf(m, dmsg, "PCD ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");

                /* Bit 7 is the PSE bit at the PUD/PMD levels, but the PAT bit at the PTE level */
                if (level <= 4 && pr & _PAGE_PSE)
                        pt_dump_cont_printf(m, dmsg, "PSE ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
                if ((level == 5 && pr & _PAGE_PAT) ||
                    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
                        pt_dump_cont_printf(m, dmsg, "PAT ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
                if (pr & _PAGE_GLOBAL)
                        pt_dump_cont_printf(m, dmsg, "GLB ");
                else
                        pt_dump_cont_printf(m, dmsg, "    ");
                if (pr & _PAGE_NX)
                        pt_dump_cont_printf(m, dmsg, "NX ");
                else
                        pt_dump_cont_printf(m, dmsg, "x  ");
        }
        pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
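
/*
 * A resulting line, as seen in the debugfs dump, looks roughly like
 * this (illustrative; addresses, sizes and spacing vary):
 *
 *   0xffffffff81000000-0xffffffff81e00000        14M     ro     PSE GLB x  pmd
 */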

/*
 * On 64 bits, sign-extend the address to 64 bit: bit 47 (or bit 56
 * with 5-level paging) is propagated into the upper, non-canonical
 * hole.
 */
static unsigned long normalize_addr(unsigned long u)
{
        int shift;
        if (!IS_ENABLED(CONFIG_X86_64))
                return u;

        shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
        return (signed long)(u << shift) >> shift;
}
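
/*
 * Worked example (4-level paging, __VIRTUAL_MASK_SHIFT == 47):
 * shift = 64 - 48 = 16, so 0x0000800000000000 << 16 is
 * 0x8000000000000000, and the arithmetic shift back down yields the
 * canonical address 0xffff800000000000.
 */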

/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
                      pgprot_t new_prot, int level)
{
        pgprotval_t prot, cur;
        static const char units[] = "BKMGTPE";

        /*
         * If we have a "break" in the series, we need to flush the state
         * that we have now. A "break" is a change in permissions or level,
         * or crossing an address space marker.
         */
        prot = pgprot_val(new_prot);
        cur = pgprot_val(st->current_prot);

        if (!st->level) {
                /* First entry */
                st->current_prot = new_prot;
                st->level = level;
                st->marker = address_markers;
                st->lines = 0;
                pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
                                   st->marker->name);
        } else if (prot != cur || level != st->level ||
                   st->current_address >= st->marker[1].start_address) {
                const char *unit = units;
                unsigned long delta;
                int width = sizeof(unsigned long) * 2;
                pgprotval_t pr = pgprot_val(st->current_prot);

                if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
                        WARN_ONCE(1,
                                  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
                                  (void *)st->start_address,
                                  (void *)st->start_address);
                        st->wx_pages += (st->current_address -
                                         st->start_address) / PAGE_SIZE;
                }

                /*
                 * Now print the actual finished series
                 */
                if (!st->marker->max_lines ||
                    st->lines < st->marker->max_lines) {
                        pt_dump_seq_printf(m, st->to_dmesg,
                                           "0x%0*lx-0x%0*lx   ",
                                           width, st->start_address,
                                           width, st->current_address);

                        /* Scale the size to the largest unit that divides it evenly */
                        delta = st->current_address - st->start_address;
                        while (!(delta & 1023) && unit[1]) {
                                delta >>= 10;
                                unit++;
                        }
                        pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
                                            delta, *unit);
                        printk_prot(m, st->current_prot, st->level,
                                    st->to_dmesg);
                }
                st->lines++;

                /*
                 * We print markers for special areas of address space,
                 * such as the start of vmalloc space etc.
                 * This helps in the interpretation.
                 */
                if (st->current_address >= st->marker[1].start_address) {
                        if (st->marker->max_lines &&
                            st->lines > st->marker->max_lines) {
                                unsigned long nskip =
                                        st->lines - st->marker->max_lines;
                                pt_dump_seq_printf(m, st->to_dmesg,
                                                   "... %lu entr%s skipped ...\n",
                                                   nskip,
                                                   nskip == 1 ? "y" : "ies");
                        }
                        st->marker++;
                        st->lines = 0;
                        pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
                                           st->marker->name);
                }

                st->start_address = st->current_address;
                st->current_prot = new_prot;
                st->level = level;
        }
}

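/*
 * Walk the PTRS_PER_PTE entries below one pmd entry; at this level each
 * present entry maps a single 4 KiB page.
 */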
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
        int i;
        pte_t *start;
        pgprotval_t prot;

        start = (pte_t *)pmd_page_vaddr(addr);
        for (i = 0; i < PTRS_PER_PTE; i++) {
                prot = pte_flags(*start);
                st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
                note_page(m, st, __pgprot(prot), 5);
                start++;
        }
}
#ifdef CONFIG_KASAN

/*
 * This is an optimization for the KASAN=y case. Since all KASAN page
 * tables eventually point to the kasan_zero_page, we can call note_page()
 * right away without walking through the lower-level page tables. This
 * saves us dozens of seconds (minutes for a 5-level config) when checking
 * for W+X mappings or reading the kernel_page_tables debugfs file.
 */
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
                                void *pt)
{
        if (__pa(pt) == __pa(kasan_zero_pmd) ||
#ifdef CONFIG_X86_5LEVEL
            __pa(pt) == __pa(kasan_zero_p4d) ||
#endif
            __pa(pt) == __pa(kasan_zero_pud)) {
                pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
                note_page(m, st, __pgprot(prot), 5);
                return true;
        }
        return false;
}
#else
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
                                void *pt)
{
        return false;
}
#endif

#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
        int i;
        pmd_t *start, *pmd_start;
        pgprotval_t prot;

        pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
        for (i = 0; i < PTRS_PER_PMD; i++) {
                st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
                if (!pmd_none(*start)) {
                        if (pmd_large(*start) || !pmd_present(*start)) {
                                prot = pmd_flags(*start);
                                note_page(m, st, __pgprot(prot), 4);
                        } else if (!kasan_page_table(m, st, pmd_start)) {
                                walk_pte_level(m, st, *start,
                                               P + i * PMD_LEVEL_MULT);
                        }
                } else
                        note_page(m, st, __pgprot(0), 4);
                start++;
        }
}

#else
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
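
/*
 * When a level is folded away (e.g. PTRS_PER_PMD == 1 above), the walk
 * collapses into a direct call one level down and the pud_*() helpers
 * are expressed via their pmd_*() counterparts. The PUD and P4D
 * walkers below follow the same pattern.
 */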

#if PTRS_PER_PUD > 1

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
        int i;
        pud_t *start, *pud_start;
        pgprotval_t prot;

        pud_start = start = (pud_t *)p4d_page_vaddr(addr);

        for (i = 0; i < PTRS_PER_PUD; i++) {
                st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
                if (!pud_none(*start)) {
                        if (pud_large(*start) || !pud_present(*start)) {
                                prot = pud_flags(*start);
                                note_page(m, st, __pgprot(prot), 3);
                        } else if (!kasan_page_table(m, st, pud_start)) {
                                walk_pmd_level(m, st, *start,
                                               P + i * PUD_LEVEL_MULT);
                        }
                } else
                        note_page(m, st, __pgprot(0), 3);

                start++;
        }
}

#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

#if PTRS_PER_P4D > 1

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
        int i;
        p4d_t *start, *p4d_start;
        pgprotval_t prot;

        p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

        for (i = 0; i < PTRS_PER_P4D; i++) {
                st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
                if (!p4d_none(*start)) {
                        if (p4d_large(*start) || !p4d_present(*start)) {
                                prot = p4d_flags(*start);
                                note_page(m, st, __pgprot(prot), 2);
                        } else if (!kasan_page_table(m, st, p4d_start)) {
                                walk_pud_level(m, st, *start,
                                               P + i * P4D_LEVEL_MULT);
                        }
                } else
                        note_page(m, st, __pgprot(0), 2);

                start++;
        }
}

#else
#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
#define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
        /*
         * A hole at the beginning of the kernel address space, reserved
         * for a hypervisor.
         */
        return  (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
                (idx <  pgd_index(GUARD_HOLE_END_ADDR));
#else
        return false;
#endif
}

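/*
 * Core walker: visit every pgd slot and descend into present entries.
 * 'pgd' selects the page table to dump (NULL means the kernel's init
 * page table), 'checkwx' enables the W+X audit, and 'dmesg' (honored
 * only when a pgd is supplied) routes output to the kernel log.
 */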
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
                                       bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
        pgd_t *start = (pgd_t *) &init_top_pgt;
#else
        pgd_t *start = swapper_pg_dir;
#endif
        pgprotval_t prot;
        int i;
        struct pg_state st = {};

        if (pgd) {
                start = pgd;
                st.to_dmesg = dmesg;
        }

        st.check_wx = checkwx;
        if (checkwx)
                st.wx_pages = 0;

        for (i = 0; i < PTRS_PER_PGD; i++) {
                st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
                if (!pgd_none(*start) && !is_hypervisor_range(i)) {
                        if (pgd_large(*start) || !pgd_present(*start)) {
                                prot = pgd_flags(*start);
                                note_page(m, &st, __pgprot(prot), 1);
                        } else {
                                walk_p4d_level(m, &st, *start,
                                               i * PGD_LEVEL_MULT);
                        }
                } else
                        note_page(m, &st, __pgprot(0), 1);

                cond_resched();
                start++;
        }

        /* Flush out the last page */
        st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
        note_page(m, &st, __pgprot(0), 0);
        if (!checkwx)
                return;
        if (st.wx_pages)
                pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
                        st.wx_pages);
        else
                pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
        ptdump_walk_pgd_level_core(m, pgd, false, true);
}

void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        if (user && static_cpu_has(X86_FEATURE_PTI))
                pgd = kernel_to_user_pgdp(pgd);
#endif
        ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

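/*
 * With page table isolation, every pgd allocation carries a second,
 * user-space copy of the page tables; kernel_to_user_pgdp() returns
 * that copy so it can be audited on its own.
 */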
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
        pgd_t *pgd = (pgd_t *) &init_top_pgt;

        if (!static_cpu_has(X86_FEATURE_PTI))
                return;

        pr_info("x86/mm: Checking user space page tables\n");
        pgd = kernel_to_user_pgdp(pgd);
        ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}

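/*
 * Entry point for the boot-time W+X audit (typically reached via
 * debug_checkwx() when CONFIG_DEBUG_WX is enabled): check the kernel
 * page tables and, with PTI active, the user-space copy as well.
 */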
void ptdump_walk_pgd_level_checkwx(void)
{
        ptdump_walk_pgd_level_core(NULL, NULL, true, false);
        ptdump_walk_user_pgd_level_checkwx();
}

static int __init pt_dump_init(void)
{
        /*
         * Various markers are not compile-time constants, so assign them
         * here.
         */
#ifdef CONFIG_X86_64
        address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
        address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#endif
#ifdef CONFIG_X86_32
        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
        address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
        address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
        address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
        address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
        return 0;
}
__initcall(pt_dump_init);
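
/*
 * Usage sketch (assumes CONFIG_X86_PTDUMP and debugfs mounted at
 * /sys/kernel/debug):
 *
 *   # cat /sys/kernel/debug/kernel_page_tables
 *
 * The boot-time W+X audit runs independently of the debugfs interface
 * when CONFIG_DEBUG_WX is enabled.
 */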