/*
 * Re-map IO memory to kernel address space so that we can access it.
 * This is needed for high PCI addresses that aren't mapped in the
 * 640k-1MB IO memory area on PC's
 *
 * (C) Copyright 1995 1996 Linus Torvalds
 */

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
#include <linux/mem_encrypt.h>
#include <linux/efi.h>

#include <asm/set_memory.h>
#include <asm/e820/api.h>
#include <asm/fixmap.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
#include <asm/setup.h>

#include "physaddr.h"

struct ioremap_mem_flags {
        bool system_ram;
        bool desc_other;
};

/*
 * Fix up the linear direct mapping of the kernel to avoid cache attribute
 * conflicts.
 */
int ioremap_change_attr(unsigned long vaddr, unsigned long size,
                        enum page_cache_mode pcm)
{
        unsigned long nrpages = size >> PAGE_SHIFT;
        int err;

        switch (pcm) {
        case _PAGE_CACHE_MODE_UC:
        default:
                err = _set_memory_uc(vaddr, nrpages);
                break;
        case _PAGE_CACHE_MODE_WC:
                err = _set_memory_wc(vaddr, nrpages);
                break;
        case _PAGE_CACHE_MODE_WT:
                err = _set_memory_wt(vaddr, nrpages);
                break;
        case _PAGE_CACHE_MODE_WB:
                err = _set_memory_wb(vaddr, nrpages);
                break;
        }

        return err;
}

static bool __ioremap_check_ram(struct resource *res)
{
        unsigned long start_pfn, stop_pfn;
        unsigned long i;

        if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
                return false;

        start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
        stop_pfn = (res->end + 1) >> PAGE_SHIFT;
        if (stop_pfn > start_pfn) {
                for (i = 0; i < (stop_pfn - start_pfn); ++i)
                        if (pfn_valid(start_pfn + i) &&
                            !PageReserved(pfn_to_page(start_pfn + i)))
                                return true;
        }

        return false;
}

static int __ioremap_check_desc_other(struct resource *res)
{
        return (res->desc != IORES_DESC_NONE);
}

static int __ioremap_res_check(struct resource *res, void *arg)
{
        struct ioremap_mem_flags *flags = arg;

        if (!flags->system_ram)
                flags->system_ram = __ioremap_check_ram(res);

        if (!flags->desc_other)
                flags->desc_other = __ioremap_check_desc_other(res);

        return flags->system_ram && flags->desc_other;
}

/*
 * To avoid multiple resource walks, this function walks resources marked as
 * IORESOURCE_MEM and IORESOURCE_BUSY and looks for system RAM and/or a
 * resource not described as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
 */
static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
                                struct ioremap_mem_flags *flags)
{
        u64 start, end;

        start = (u64)addr;
        end = start + size - 1;
        memset(flags, 0, sizeof(*flags));

        walk_mem_res(start, end, flags, __ioremap_res_check);
}

/*
 * Remap an arbitrary physical address space into the kernel virtual
 * address space. It transparently creates kernel huge I/O mapping when
 * the physical address is aligned by a huge page size (1GB or 2MB) and
 * the requested size is at least the huge page size.
 *
 * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
 * Therefore, the mapping code falls back to use a smaller page toward 4KB
 * when a mapping range is covered by non-WB type of MTRRs.
 *
 * NOTE! We need to allow non-page-aligned mappings too: we will obviously
 * have to convert them into an offset in a page-aligned mapping, but the
 * caller shouldn't need to know that small detail.
 */
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
                unsigned long size, enum page_cache_mode pcm, void *caller)
{
        unsigned long offset, vaddr;
        resource_size_t last_addr;
        const resource_size_t unaligned_phys_addr = phys_addr;
        const unsigned long unaligned_size = size;
        struct ioremap_mem_flags mem_flags;
        struct vm_struct *area;
        enum page_cache_mode new_pcm;
        pgprot_t prot;
        int retval;
        void __iomem *ret_addr;

        /* Don't allow wraparound or zero size */
        last_addr = phys_addr + size - 1;
        if (!size || last_addr < phys_addr)
                return NULL;

        if (!phys_addr_valid(phys_addr)) {
                printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
                       (unsigned long long)phys_addr);
                WARN_ON_ONCE(1);
                return NULL;
        }

        __ioremap_check_mem(phys_addr, size, &mem_flags);

        /*
         * Don't allow anybody to remap normal RAM that we're using..
         */
        if (mem_flags.system_ram) {
                WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
                          &phys_addr, &last_addr);
                return NULL;
        }

        /*
         * Mappings have to be page-aligned
         */
        offset = phys_addr & ~PAGE_MASK;
        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr+1) - phys_addr;

        /*
         * Mask out any bits not part of the actual physical
         * address, like memory encryption bits.
         */
        phys_addr &= PHYSICAL_PAGE_MASK;

        retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
                                                pcm, &new_pcm);
        if (retval) {
                printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
                return NULL;
        }

        if (pcm != new_pcm) {
                if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) {
                        printk(KERN_ERR
                "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n",
                                (unsigned long long)phys_addr,
                                (unsigned long long)(phys_addr + size),
                                pcm, new_pcm);
                        goto err_free_memtype;
                }
                pcm = new_pcm;
        }

        /*
         * If the page being mapped is in memory and SEV is active then
         * make sure the memory encryption attribute is enabled in the
         * resulting mapping.
         */
        prot = PAGE_KERNEL_IO;
        if (sev_active() && mem_flags.desc_other)
                prot = pgprot_encrypted(prot);

        switch (pcm) {
        case _PAGE_CACHE_MODE_UC:
        default:
                prot = __pgprot(pgprot_val(prot) |
                                cachemode2protval(_PAGE_CACHE_MODE_UC));
                break;
        case _PAGE_CACHE_MODE_UC_MINUS:
                prot = __pgprot(pgprot_val(prot) |
                                cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
                break;
        case _PAGE_CACHE_MODE_WC:
                prot = __pgprot(pgprot_val(prot) |
                                cachemode2protval(_PAGE_CACHE_MODE_WC));
                break;
        case _PAGE_CACHE_MODE_WT:
                prot = __pgprot(pgprot_val(prot) |
                                cachemode2protval(_PAGE_CACHE_MODE_WT));
                break;
        case _PAGE_CACHE_MODE_WB:
                break;
        }

        area = get_vm_area_caller(size, VM_IOREMAP, caller);
        if (!area)
                goto err_free_memtype;
        area->phys_addr = phys_addr;
        vaddr = (unsigned long) area->addr;

        if (kernel_map_sync_memtype(phys_addr, size, pcm))
                goto err_free_area;

        if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
                goto err_free_area;

        ret_addr = (void __iomem *) (vaddr + offset);
        mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);

        /*
         * Check if the request spans more than any BAR in the iomem resource
         * tree.
         */
        if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size))
                pr_warn("caller %pS mapping multiple BARs\n", caller);

        return ret_addr;
err_free_area:
        free_vm_area(area);
err_free_memtype:
        free_memtype(phys_addr, phys_addr + size);
        return NULL;
}

/**
 * ioremap_nocache - map bus memory into CPU space
 * @phys_addr: bus address of the memory
 * @size: size of the resource to map
 *
 * ioremap_nocache performs a platform specific sequence of operations to
 * make bus memory CPU accessible via the readb/readw/readl/writeb/
 * writew/writel functions and the other mmio helpers. The returned
 * address is not guaranteed to be usable directly as a virtual
 * address.
 *
 * This version of ioremap ensures that the memory is marked uncachable
 * on the CPU as well as honouring existing caching rules from things like
 * the PCI bus. Note that there are other caches and buffers on many
 * busses. In particular driver authors should read up on PCI writes.
 *
 * It's useful if some control registers are in such an area and
 * write combining or read caching is not desirable.
 *
 * Must be freed with iounmap.
 */
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
        /*
         * Ideally, this should be:
         *      pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
         *
         * Till we fix all X drivers to use ioremap_wc(), we will use
         * UC MINUS. Drivers that are certain they need or can already
         * be converted over to strong UC can use ioremap_uc().
         */
        enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;

        return __ioremap_caller(phys_addr, size, pcm,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);

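/*
 * Illustrative sketch of typical driver usage (not part of this file's
 * code; "pdev", the BAR index and the register offset are hypothetical):
 *
 *      void __iomem *regs;
 *
 *      regs = ioremap_nocache(pci_resource_start(pdev, 0),
 *                             pci_resource_len(pdev, 0));
 *      if (!regs)
 *              return -ENOMEM;
 *      writel(1, regs + 0x10);
 *      iounmap(regs);
 */
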
/**
 * ioremap_uc - map bus memory into CPU space as strongly uncachable
 * @phys_addr: bus address of the memory
 * @size: size of the resource to map
 *
 * ioremap_uc performs a platform specific sequence of operations to
 * make bus memory CPU accessible via the readb/readw/readl/writeb/
 * writew/writel functions and the other mmio helpers. The returned
 * address is not guaranteed to be usable directly as a virtual
 * address.
 *
 * This version of ioremap ensures that the memory is marked with a strong
 * preference as completely uncachable on the CPU when possible. For non-PAT
 * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
 * systems this will set the PAT entry for the pages as strong UC. This call
 * will honor existing caching rules from things like the PCI bus. Note that
 * there are other caches and buffers on many busses. In particular driver
 * authors should read up on PCI writes.
 *
 * It's useful if some control registers are in such an area and
 * write combining or read caching is not desirable.
 *
 * Must be freed with iounmap.
 */
void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
{
        enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;

        return __ioremap_caller(phys_addr, size, pcm,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(ioremap_uc);

/**
 * ioremap_wc - map memory into CPU space write combined
 * @phys_addr: bus address of the memory
 * @size: size of the resource to map
 *
 * This version of ioremap ensures that the memory is marked write combining.
 * Write combining allows faster writes to some hardware devices.
 *
 * Must be freed with iounmap.
 */
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
        return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
                                        __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_wc);

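/*
 * A write-combined mapping suits streaming writes such as a framebuffer
 * fill. A minimal sketch, assuming "fb_base" and "fb_size" describe a
 * hypothetical device aperture:
 *
 *      void __iomem *fb = ioremap_wc(fb_base, fb_size);
 *
 *      if (fb) {
 *              memset_io(fb, 0, fb_size);
 *              iounmap(fb);
 *      }
 */
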
/**
 * ioremap_wt - map memory into CPU space write through
 * @phys_addr: bus address of the memory
 * @size: size of the resource to map
 *
 * This version of ioremap ensures that the memory is marked write through.
 * Write through stores data into memory while keeping the cache up-to-date.
 *
 * Must be freed with iounmap.
 */
void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
{
        return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
                                        __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_wt);

void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
        return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);

void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
                           unsigned long prot_val)
{
        return __ioremap_caller(phys_addr, size,
                                pgprot2cachemode(__pgprot(prot_val)),
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);

/**
 * iounmap - Free an IO remapping
 * @addr: virtual address from ioremap_*
 *
 * Caller must ensure there is only one unmapping for the same pointer.
 */
void iounmap(volatile void __iomem *addr)
{
        struct vm_struct *p, *o;

        if ((void __force *)addr <= high_memory)
                return;

        /*
         * The PCI/ISA range special-casing was removed from __ioremap()
         * so this check, in theory, can be removed. However, there are
         * cases where iounmap() is called for addresses not obtained via
         * ioremap() (vga16fb for example). Add a warning so that these
         * cases can be caught and fixed.
         */
        if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
            (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
                WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
                return;
        }

        mmiotrace_iounmap(addr);

        addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr);

        /* Use the vm area unlocked, assuming the caller
           ensures there isn't another iounmap for the same address
           in parallel. Reuse of the virtual address is prevented by
           leaving it in the global lists until we're done with it.
           cpa takes care of the direct mappings. */
        p = find_vm_area((void __force *)addr);

        if (!p) {
                printk(KERN_ERR "iounmap: bad address %p\n", addr);
                dump_stack();
                return;
        }

        free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));

        /* Finally remove it */
        o = remove_vm_area((void __force *)addr);
        BUG_ON(p != o || o == NULL);
        kfree(p);
}
EXPORT_SYMBOL(iounmap);

int __init arch_ioremap_pud_supported(void)
{
#ifdef CONFIG_X86_64
        return boot_cpu_has(X86_FEATURE_GBPAGES);
#else
        return 0;
#endif
}

int __init arch_ioremap_pmd_supported(void)
{
        return boot_cpu_has(X86_FEATURE_PSE);
}

/*
 * Convert a physical pointer to a virtual kernel pointer for /dev/mem
 * access
 */
void *xlate_dev_mem_ptr(phys_addr_t phys)
{
        unsigned long start  = phys &  PAGE_MASK;
        unsigned long offset = phys & ~PAGE_MASK;
        void *vaddr;

        /* memremap() maps if RAM, otherwise falls back to ioremap() */
        vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);

        /* Only add the offset on success and return NULL if memremap() failed */
        if (vaddr)
                vaddr += offset;

        return vaddr;
}

void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
{
        memunmap((void *)((unsigned long)addr & PAGE_MASK));
}

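/*
 * Sketch of the intended xlate/unxlate pairing, in the style of a
 * /dev/mem read path ("buf" and "count" are hypothetical; error
 * handling trimmed):
 *
 *      void *ptr = xlate_dev_mem_ptr(phys);
 *
 *      if (ptr) {
 *              memcpy(buf, ptr, count);
 *              unxlate_dev_mem_ptr(phys, ptr);
 *      }
 */
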
/*
 * Examine the physical address to determine if it is an area of memory
 * that should be mapped decrypted. If the memory is not part of the
 * kernel usable area it was accessed and created decrypted, so these
 * areas should be mapped decrypted. And since the encryption key can
 * change across reboots, persistent memory should also be mapped
 * decrypted.
 *
 * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
 * only persistent memory should be mapped decrypted.
 */
static bool memremap_should_map_decrypted(resource_size_t phys_addr,
                                          unsigned long size)
{
        int is_pmem;

        /*
         * Check if the address is part of a persistent memory region.
         * This check covers areas added by E820, EFI and ACPI.
         */
        is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
                                    IORES_DESC_PERSISTENT_MEMORY);
        if (is_pmem != REGION_DISJOINT)
                return true;

        /*
         * Check if the non-volatile attribute is set for an EFI
         * reserved area.
         */
        if (efi_enabled(EFI_BOOT)) {
                switch (efi_mem_type(phys_addr)) {
                case EFI_RESERVED_TYPE:
                        if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
                                return true;
                        break;
                default:
                        break;
                }
        }

        /* Check if the address is outside kernel usable area */
        switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
        case E820_TYPE_RESERVED:
        case E820_TYPE_ACPI:
        case E820_TYPE_NVS:
        case E820_TYPE_UNUSABLE:
                /* For SEV, these areas are encrypted */
                if (sev_active())
                        break;
                /* Fallthrough */

        case E820_TYPE_PRAM:
                return true;

        default:
                break;
        }

        return false;
}

/*
 * Examine the physical address to determine if it is EFI data. Check
 * it against the boot params structure and EFI tables and memory types.
 */
static bool memremap_is_efi_data(resource_size_t phys_addr,
                                 unsigned long size)
{
        u64 paddr;

        /* Check if the address is part of EFI boot/runtime data */
        if (!efi_enabled(EFI_BOOT))
                return false;

        paddr = boot_params.efi_info.efi_memmap_hi;
        paddr <<= 32;
        paddr |= boot_params.efi_info.efi_memmap;
        if (phys_addr == paddr)
                return true;

        paddr = boot_params.efi_info.efi_systab_hi;
        paddr <<= 32;
        paddr |= boot_params.efi_info.efi_systab;
        if (phys_addr == paddr)
                return true;

        if (efi_is_table_address(phys_addr))
                return true;

        switch (efi_mem_type(phys_addr)) {
        case EFI_BOOT_SERVICES_DATA:
        case EFI_RUNTIME_SERVICES_DATA:
                return true;
        default:
                break;
        }

        return false;
}

/*
 * Examine the physical address to determine if it is boot data by checking
 * it against the boot params setup_data chain.
 */
static bool memremap_is_setup_data(resource_size_t phys_addr,
                                   unsigned long size)
{
        struct setup_data *data;
        u64 paddr, paddr_next;

        paddr = boot_params.hdr.setup_data;
        while (paddr) {
                unsigned int len;

                if (phys_addr == paddr)
                        return true;

                data = memremap(paddr, sizeof(*data),
                                MEMREMAP_WB | MEMREMAP_DEC);

                paddr_next = data->next;
                len = data->len;

                memunmap(data);

                if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
                        return true;

                paddr = paddr_next;
        }

        return false;
}

/*
 * Examine the physical address to determine if it is boot data by checking
 * it against the boot params setup_data chain (early boot version).
 */
static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
                                                unsigned long size)
{
        struct setup_data *data;
        u64 paddr, paddr_next;

        paddr = boot_params.hdr.setup_data;
        while (paddr) {
                unsigned int len;

                if (phys_addr == paddr)
                        return true;

                data = early_memremap_decrypted(paddr, sizeof(*data));

                paddr_next = data->next;
                len = data->len;

                early_memunmap(data, sizeof(*data));

                if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
                        return true;

                paddr = paddr_next;
        }

        return false;
}

/*
 * Architecture function to determine if RAM remap is allowed. By default, a
 * RAM remap will map the data as encrypted. Determine if a RAM remap should
 * not be done so that the data will be mapped decrypted.
 */
bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
                                 unsigned long flags)
{
        if (!mem_encrypt_active())
                return true;

        if (flags & MEMREMAP_ENC)
                return true;

        if (flags & MEMREMAP_DEC)
                return false;

        if (sme_active()) {
                if (memremap_is_setup_data(phys_addr, size) ||
                    memremap_is_efi_data(phys_addr, size))
                        return false;
        }

        return !memremap_should_map_decrypted(phys_addr, size);
}

/*
 * Architecture override of __weak function to adjust the protection attributes
 * used when remapping memory. By default, early_memremap() will map the data
 * as encrypted. Determine if an encrypted mapping should not be done and set
 * the appropriate protection attributes.
 */
pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
                                             unsigned long size,
                                             pgprot_t prot)
{
        bool encrypted_prot;

        if (!mem_encrypt_active())
                return prot;

        encrypted_prot = true;

        if (sme_active()) {
                if (early_memremap_is_setup_data(phys_addr, size) ||
                    memremap_is_efi_data(phys_addr, size))
                        encrypted_prot = false;
        }

        if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
                encrypted_prot = false;

        return encrypted_prot ? pgprot_encrypted(prot)
                              : pgprot_decrypted(prot);
}

bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
{
        return arch_memremap_can_ram_remap(phys_addr, size, 0);
}

#ifdef CONFIG_AMD_MEM_ENCRYPT
/* Remap memory with encryption */
void __init *early_memremap_encrypted(resource_size_t phys_addr,
                                      unsigned long size)
{
        return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
}

/*
 * Remap memory with encryption and write-protected - cannot be called
 * before pat_init() is called
 */
void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
                                         unsigned long size)
{
        /* Be sure the write-protect PAT entry is set for write-protect */
        if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
                return NULL;

        return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
}

/* Remap memory without encryption */
void __init *early_memremap_decrypted(resource_size_t phys_addr,
                                      unsigned long size)
{
        return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
}

/*
 * Remap memory without encryption and write-protected - cannot be called
 * before pat_init() is called
 */
void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
                                         unsigned long size)
{
        /* Be sure the write-protect PAT entry is set for write-protect */
        if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
                return NULL;

        return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
#endif  /* CONFIG_AMD_MEM_ENCRYPT */

static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;

static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
        /* Don't assume we're using swapper_pg_dir at this point */
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(addr)];
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        pmd_t *pmd = pmd_offset(pud, addr);

        return pmd;
}

static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
        return &bm_pte[pte_index(addr)];
}

bool __init is_early_ioremap_ptep(pte_t *ptep)
{
        return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
}

void __init early_ioremap_init(void)
{
        pmd_t *pmd;

#ifdef CONFIG_X86_64
        BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
#else
        WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
#endif

        early_ioremap_setup();

        pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
        memset(bm_pte, 0, sizeof(bm_pte));
        pmd_populate_kernel(&init_mm, pmd, bm_pte);

        /*
         * The boot-ioremap range spans multiple pmds, for which
         * we are not prepared:
         */
#define __FIXADDR_TOP (-PAGE_SIZE)
        BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
                     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
#undef __FIXADDR_TOP

        if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
                WARN_ON(1);
                printk(KERN_WARNING "pmd %p != %p\n",
                       pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
                printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
                        fix_to_virt(FIX_BTMAP_BEGIN));
                printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
                        fix_to_virt(FIX_BTMAP_END));

                printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
                printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
                       FIX_BTMAP_BEGIN);
        }
}

void __init __early_set_fixmap(enum fixed_addresses idx,
                               phys_addr_t phys, pgprot_t flags)
{
        unsigned long addr = __fix_to_virt(idx);
        pte_t *pte;

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        pte = early_ioremap_pte(addr);

        /* Sanitize 'prot' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        if (pgprot_val(flags))
                set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
        else
                pte_clear(&init_mm, addr, pte);
        __flush_tlb_one_kernel(addr);
}