1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/io.h>
38 #include <linux/iova.h>
39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h>
41 #include <linux/syscore_ops.h>
42 #include <linux/tboot.h>
43 #include <linux/dmi.h>
44 #include <linux/pci-ats.h>
45 #include <linux/memblock.h>
46 #include <linux/dma-contiguous.h>
47 #include <linux/crash_dump.h>
48 #include <asm/irq_remapping.h>
49 #include <asm/cacheflush.h>
50 #include <asm/iommu.h>
51
52 #include "irq_remapping.h"
53
54 #define ROOT_SIZE               VTD_PAGE_SIZE
55 #define CONTEXT_SIZE            VTD_PAGE_SIZE
56
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61
62 #define IOAPIC_RANGE_START      (0xfee00000)
63 #define IOAPIC_RANGE_END        (0xfeefffff)
64 #define IOVA_START_ADDR         (0x1000)
65
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
67
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
77                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN          (1)
82
83 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
84 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
85 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
86
87 /* page table handling */
88 #define LEVEL_STRIDE            (9)
89 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
90
91 /*
92  * This bitmap is used to advertise the page sizes our hardware supports
93  * to the IOMMU core, which will then use this information to split
94  * physically contiguous memory regions it is mapping into page sizes
95  * that we support.
96  *
97  * Traditionally the IOMMU core just handed us the mappings directly,
98  * after making sure the size is a power-of-two multiple of 4KiB and that the
99  * mapping has natural alignment.
100  *
101  * To retain this behavior, we currently advertise that we support
102  * all page sizes that are a power-of-two multiple of 4KiB.
103  *
104  * If at some point we'd like to utilize the IOMMU core's new behavior,
105  * we could change this to advertise the real page sizes we support.
106  */
107 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
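/*
 * ~0xFFFUL sets every bit from 12 upwards in the pgsize bitmap handed to the
 * IOMMU core: bit N set means a 2^N byte page size is accepted, so this
 * advertises every power-of-two size of 4KiB and above.
 */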
108
109 static inline int agaw_to_level(int agaw)
110 {
111         return agaw + 2;
112 }
113
114 static inline int agaw_to_width(int agaw)
115 {
116         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
117 }
118
119 static inline int width_to_agaw(int width)
120 {
121         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
122 }
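/*
 * Worked example for the AGAW helpers above: the default domain address
 * width of 48 bits gives width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2,
 * i.e. a 4-level page table (agaw_to_level(2) == 4) covering
 * agaw_to_width(2) == 48 bits of IOVA space.
 */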
123
124 static inline unsigned int level_to_offset_bits(int level)
125 {
126         return (level - 1) * LEVEL_STRIDE;
127 }
128
129 static inline int pfn_level_offset(unsigned long pfn, int level)
130 {
131         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
132 }
133
134 static inline unsigned long level_mask(int level)
135 {
136         return -1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long level_size(int level)
140 {
141         return 1UL << level_to_offset_bits(level);
142 }
143
144 static inline unsigned long align_to_level(unsigned long pfn, int level)
145 {
146         return (pfn + level_size(level) - 1) & level_mask(level);
147 }
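/*
 * Example values for the level helpers: at level 1 (the 4KiB leaf level)
 * level_to_offset_bits() is 0 and level_size() is one page; at level 2 it
 * is 9 bits and 512 pages (a 2MiB region); at level 3 it is 18 bits and
 * 256K pages (1GiB). pfn_level_offset() extracts the 9-bit table index for
 * a level, and align_to_level() rounds a pfn up to that level's boundary.
 */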
148
149 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
150 {
151         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
152 }
153
154 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
155    are never going to work. */
156 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
157 {
158         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160
161 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
162 {
163         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
164 }
165 static inline unsigned long page_to_dma_pfn(struct page *pg)
166 {
167         return mm_to_dma_pfn(page_to_pfn(pg));
168 }
169 static inline unsigned long virt_to_dma_pfn(void *p)
170 {
171         return page_to_dma_pfn(virt_to_page(p));
172 }
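/*
 * On x86 PAGE_SHIFT equals VTD_PAGE_SHIFT (both 12), so the pfn conversions
 * above are a shift by zero and MM pfns map 1:1 to DMA pfns. On
 * configurations with larger MM pages each MM page spans several 4KiB VT-d
 * pages, which is why the shift direction above matters.
 */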
173
174 /* global iommu list, set NULL for ignored DMAR units */
175 static struct intel_iommu **g_iommus;
176
177 static void __init check_tylersburg_isoch(void);
178 static int rwbf_quirk;
179
180 /*
181  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
182  * (used when the kernel is launched with TXT)
183  */
184 static int force_on = 0;
185
186 /*
187  * 0: Present
188  * 1-11: Reserved
189  * 12-63: Context Ptr (12 - (haw-1))
190  * 64-127: Reserved
191  */
192 struct root_entry {
193         u64     lo;
194         u64     hi;
195 };
196 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
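/*
 * The root table is a single 4KiB page of 16-byte root entries, so
 * ROOT_ENTRY_NR is 256: one root entry per PCI bus number. Each present
 * entry points at a context table which is in turn indexed by devfn.
 */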
197
198 /*
199  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
200  * if marked present.
201  */
202 static phys_addr_t root_entry_lctp(struct root_entry *re)
203 {
204         if (!(re->lo & 1))
205                 return 0;
206
207         return re->lo & VTD_PAGE_MASK;
208 }
209
210 /*
211  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
212  * if marked present.
213  */
214 static phys_addr_t root_entry_uctp(struct root_entry *re)
215 {
216         if (!(re->hi & 1))
217                 return 0;
218
219         return re->hi & VTD_PAGE_MASK;
220 }
221 /*
222  * low 64 bits:
223  * 0: present
224  * 1: fault processing disable
225  * 2-3: translation type
226  * 12-63: address space root
227  * high 64 bits:
228  * 0-2: address width
229  * 3-6: avail
230  * 8-23: domain id
231  */
232 struct context_entry {
233         u64 lo;
234         u64 hi;
235 };
236
237 static inline void context_clear_pasid_enable(struct context_entry *context)
238 {
239         context->lo &= ~(1ULL << 11);
240 }
241
242 static inline bool context_pasid_enabled(struct context_entry *context)
243 {
244         return !!(context->lo & (1ULL << 11));
245 }
246
247 static inline void context_set_copied(struct context_entry *context)
248 {
249         context->hi |= (1ull << 3);
250 }
251
252 static inline bool context_copied(struct context_entry *context)
253 {
254         return !!(context->hi & (1ULL << 3));
255 }
256
257 static inline bool __context_present(struct context_entry *context)
258 {
259         return (context->lo & 1);
260 }
261
262 static inline bool context_present(struct context_entry *context)
263 {
264         return context_pasid_enabled(context) ?
265              __context_present(context) :
266              __context_present(context) && !context_copied(context);
267 }
268
269 static inline void context_set_present(struct context_entry *context)
270 {
271         context->lo |= 1;
272 }
273
274 static inline void context_set_fault_enable(struct context_entry *context)
275 {
276         context->lo &= (((u64)-1) << 2) | 1;
277 }
278
279 static inline void context_set_translation_type(struct context_entry *context,
280                                                 unsigned long value)
281 {
282         context->lo &= (((u64)-1) << 4) | 3;
283         context->lo |= (value & 3) << 2;
284 }
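/*
 * The translation type programmed above selects how requests from the
 * device are handled; the CONTEXT_TT_* constants in <linux/intel-iommu.h>
 * cover second-level translation, second-level translation with a device
 * IOTLB (ATS), and pass-through.
 */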
285
286 static inline void context_set_address_root(struct context_entry *context,
287                                             unsigned long value)
288 {
289         context->lo &= ~VTD_PAGE_MASK;
290         context->lo |= value & VTD_PAGE_MASK;
291 }
292
293 static inline void context_set_address_width(struct context_entry *context,
294                                              unsigned long value)
295 {
296         context->hi |= value & 7;
297 }
298
299 static inline void context_set_domain_id(struct context_entry *context,
300                                          unsigned long value)
301 {
302         context->hi |= (value & ((1 << 16) - 1)) << 8;
303 }
304
305 static inline int context_domain_id(struct context_entry *c)
306 {
307         return((c->hi >> 8) & 0xffff);
308 }
309
310 static inline void context_clear_entry(struct context_entry *context)
311 {
312         context->lo = 0;
313         context->hi = 0;
314 }
315
316 /*
317  * 0: readable
318  * 1: writable
319  * 2-6: reserved
320  * 7: super page
321  * 8-10: available
322  * 11: snoop behavior
323  * 12-63: Host physical address
324  */
325 struct dma_pte {
326         u64 val;
327 };
328
329 static inline void dma_clear_pte(struct dma_pte *pte)
330 {
331         pte->val = 0;
332 }
333
334 static inline u64 dma_pte_addr(struct dma_pte *pte)
335 {
336 #ifdef CONFIG_64BIT
337         return pte->val & VTD_PAGE_MASK;
338 #else
339         /* Must have a full atomic 64-bit read */
340         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
341 #endif
342 }
343
344 static inline bool dma_pte_present(struct dma_pte *pte)
345 {
346         return (pte->val & 3) != 0;
347 }
348
349 static inline bool dma_pte_superpage(struct dma_pte *pte)
350 {
351         return (pte->val & DMA_PTE_LARGE_PAGE);
352 }
353
354 static inline int first_pte_in_page(struct dma_pte *pte)
355 {
356         return !((unsigned long)pte & ~VTD_PAGE_MASK);
357 }
358
359 /*
360  * This domain is a statically identity mapping domain.
361  *      1. This domain creates a static 1:1 mapping to all usable memory.
362  *      2. It maps to each iommu if successful.
363  *      3. Each iommu maps to this domain if successful.
364  */
365 static struct dmar_domain *si_domain;
366 static int hw_pass_through = 1;
367
368 /*
369  * Domain represents a virtual machine; more than one device across
370  * iommus may be owned by one domain, e.g. a kvm guest.
371  */
372 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
373
374 /* si_domain contains multiple devices */
375 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
376
377 #define for_each_domain_iommu(idx, domain)                      \
378         for (idx = 0; idx < g_num_of_iommus; idx++)             \
379                 if (domain->iommu_refcnt[idx])
380
381 struct dmar_domain {
382         int     nid;                    /* node id */
383
384         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
385                                         /* Refcount of devices per iommu */
386
387
388         u16             iommu_did[DMAR_UNITS_SUPPORTED];
389                                         /* Domain ids per IOMMU. Use u16 since
390                                          * domain ids are 16 bit wide according
391                                          * to VT-d spec, section 9.3 */
392
393         struct list_head devices;       /* all devices' list */
394         struct iova_domain iovad;       /* iova's that belong to this domain */
395
396         struct dma_pte  *pgd;           /* virtual address */
397         int             gaw;            /* max guest address width */
398
399         /* adjusted guest address width, 0 is level 2 30-bit */
400         int             agaw;
401
402         int             flags;          /* flags to find out type of domain */
403
404         int             iommu_coherency;/* indicate coherency of iommu access */
405         int             iommu_snooping; /* indicate snooping control feature*/
406         int             iommu_count;    /* reference count of iommu */
407         int             iommu_superpage;/* Level of superpages supported:
408                                            0 == 4KiB (no superpages), 1 == 2MiB,
409                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
410         u64             max_addr;       /* maximum mapped address */
411
412         struct iommu_domain domain;     /* generic domain data structure for
413                                            iommu core */
414 };
415
416 /* PCI domain-device relationship */
417 struct device_domain_info {
418         struct list_head link;  /* link to domain siblings */
419         struct list_head global; /* link to global list */
420         u8 bus;                 /* PCI bus number */
421         u8 devfn;               /* PCI devfn number */
422         u16 pfsid;              /* SRIOV physical function source ID */
423         u8 pasid_supported:3;
424         u8 pasid_enabled:1;
425         u8 pri_supported:1;
426         u8 pri_enabled:1;
427         u8 ats_supported:1;
428         u8 ats_enabled:1;
429         u8 ats_qdep;
430         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
431         struct intel_iommu *iommu; /* IOMMU used by this device */
432         struct dmar_domain *domain; /* pointer to domain */
433 };
434
435 struct dmar_rmrr_unit {
436         struct list_head list;          /* list of rmrr units   */
437         struct acpi_dmar_header *hdr;   /* ACPI header          */
438         u64     base_address;           /* reserved base address*/
439         u64     end_address;            /* reserved end address */
440         struct dmar_dev_scope *devices; /* target devices */
441         int     devices_cnt;            /* target device count */
442 };
443
444 struct dmar_atsr_unit {
445         struct list_head list;          /* list of ATSR units */
446         struct acpi_dmar_header *hdr;   /* ACPI header */
447         struct dmar_dev_scope *devices; /* target devices */
448         int devices_cnt;                /* target device count */
449         u8 include_all:1;               /* include all ports */
450 };
451
452 static LIST_HEAD(dmar_atsr_units);
453 static LIST_HEAD(dmar_rmrr_units);
454
455 #define for_each_rmrr_units(rmrr) \
456         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
457
458 static void flush_unmaps_timeout(unsigned long data);
459
460 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
461
462 #define HIGH_WATER_MARK 250
463 struct deferred_flush_tables {
464         int next;
465         struct iova *iova[HIGH_WATER_MARK];
466         struct dmar_domain *domain[HIGH_WATER_MARK];
467         struct page *freelist[HIGH_WATER_MARK];
468 };
469
470 static struct deferred_flush_tables *deferred_flush;
471
472 /* bitmap for indexing intel_iommus */
473 static int g_num_of_iommus;
474
475 static DEFINE_SPINLOCK(async_umap_flush_lock);
476 static LIST_HEAD(unmaps_to_do);
477
478 static int timer_on;
479 static long list_size;
480
481 static void domain_exit(struct dmar_domain *domain);
482 static void domain_remove_dev_info(struct dmar_domain *domain);
483 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
484                                      struct device *dev);
485 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
486 static void domain_context_clear(struct intel_iommu *iommu,
487                                  struct device *dev);
488 static int domain_detach_iommu(struct dmar_domain *domain,
489                                struct intel_iommu *iommu);
490
491 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
492 int dmar_disabled = 0;
493 #else
494 int dmar_disabled = 1;
495 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
496
497 int intel_iommu_enabled = 0;
498 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
499
500 static int dmar_map_gfx = 1;
501 static int dmar_forcedac;
502 static int intel_iommu_strict;
503 static int intel_iommu_superpage = 1;
504 static int intel_iommu_ecs = 1;
505 static int intel_iommu_pasid28;
506 static int iommu_identity_mapping;
507
508 #define IDENTMAP_ALL            1
509 #define IDENTMAP_GFX            2
510 #define IDENTMAP_AZALIA         4
511
512 /* Broadwell and Skylake have broken ECS support — normal so-called "second
513  * level" translation of DMA requests-without-PASID doesn't actually happen
514  * unless you also set the NESTE bit in an extended context-entry. Which of
515  * course means that SVM doesn't work because it's trying to do nested
516  * translation of the physical addresses it finds in the process page tables,
517  * through the IOVA->phys mapping found in the "second level" page tables.
518  *
519  * The VT-d specification was retroactively changed to change the definition
520  * of the capability bits and pretend that Broadwell/Skylake never happened...
521  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
522  * for some reason it was the PASID capability bit which was redefined (from
523  * bit 28 on BDW/SKL to bit 40 in future).
524  *
525  * So our test for ECS needs to eschew those implementations which set the old
526  * PASID capability bit 28, since those are the ones on which ECS is broken.
527  * Unless we are working around the 'pasid28' limitations, that is, by putting
528  * the device into passthrough mode for normal DMA and thus masking the bug.
529  */
530 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
531                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
532 /* PASID support is thus enabled if ECS is enabled and *either* of the old
533  * or new capability bits are set. */
534 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
535                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
536
537 int intel_iommu_gfx_mapped;
538 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
539
540 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
541 static DEFINE_SPINLOCK(device_domain_lock);
542 static LIST_HEAD(device_domain_list);
543
544 static const struct iommu_ops intel_iommu_ops;
545
546 static bool translation_pre_enabled(struct intel_iommu *iommu)
547 {
548         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
549 }
550
551 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
552 {
553         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
554 }
555
556 static void init_translation_status(struct intel_iommu *iommu)
557 {
558         u32 gsts;
559
560         gsts = readl(iommu->reg + DMAR_GSTS_REG);
561         if (gsts & DMA_GSTS_TES)
562                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
563 }
564
565 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
566 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
567 {
568         return container_of(dom, struct dmar_domain, domain);
569 }
570
571 static int __init intel_iommu_setup(char *str)
572 {
573         if (!str)
574                 return -EINVAL;
575         while (*str) {
576                 if (!strncmp(str, "on", 2)) {
577                         dmar_disabled = 0;
578                         pr_info("IOMMU enabled\n");
579                 } else if (!strncmp(str, "off", 3)) {
580                         dmar_disabled = 1;
581                         pr_info("IOMMU disabled\n");
582                 } else if (!strncmp(str, "igfx_off", 8)) {
583                         dmar_map_gfx = 0;
584                         pr_info("Disable GFX device mapping\n");
585                 } else if (!strncmp(str, "forcedac", 8)) {
586                         pr_info("Forcing DAC for PCI devices\n");
587                         dmar_forcedac = 1;
588                 } else if (!strncmp(str, "strict", 6)) {
589                         pr_info("Disable batched IOTLB flush\n");
590                         intel_iommu_strict = 1;
591                 } else if (!strncmp(str, "sp_off", 6)) {
592                         pr_info("Disable supported super page\n");
593                         intel_iommu_superpage = 0;
594                 } else if (!strncmp(str, "ecs_off", 7)) {
595                         printk(KERN_INFO
596                                 "Intel-IOMMU: disable extended context table support\n");
597                         intel_iommu_ecs = 0;
598                 } else if (!strncmp(str, "pasid28", 7)) {
599                         printk(KERN_INFO
600                                 "Intel-IOMMU: enable pre-production PASID support\n");
601                         intel_iommu_pasid28 = 1;
602                         iommu_identity_mapping |= IDENTMAP_GFX;
603                 }
604
605                 str += strcspn(str, ",");
606                 while (*str == ',')
607                         str++;
608         }
609         return 0;
610 }
611 __setup("intel_iommu=", intel_iommu_setup);
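/*
 * Options are comma-separated on the kernel command line, e.g.
 * "intel_iommu=on,strict,sp_off" enables the IOMMU, disables batched IOTLB
 * flushing and disables superpage support in one go.
 */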
612
613 static struct kmem_cache *iommu_domain_cache;
614 static struct kmem_cache *iommu_devinfo_cache;
615
616 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
617 {
618         struct dmar_domain **domains;
619         int idx = did >> 8;
620
621         domains = iommu->domains[idx];
622         if (!domains)
623                 return NULL;
624
625         return domains[did & 0xff];
626 }
627
628 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
629                              struct dmar_domain *domain)
630 {
631         struct dmar_domain **domains;
632         int idx = did >> 8;
633
634         if (!iommu->domains[idx]) {
635                 size_t size = 256 * sizeof(struct dmar_domain *);
636                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
637         }
638
639         domains = iommu->domains[idx];
640         if (WARN_ON(!domains))
641                 return;
642         else
643                 domains[did & 0xff] = domain;
644 }
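/*
 * The two helpers above implement a lazily-allocated two-level lookup from
 * a 16-bit domain id to its dmar_domain: the high byte of the id selects
 * one of iommu->domains[] (a page of 256 pointers, allocated on first use)
 * and the low byte selects the slot within that page. For example, did
 * 0x1234 lands in iommu->domains[0x12][0x34].
 */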
645
646 static inline void *alloc_pgtable_page(int node)
647 {
648         struct page *page;
649         void *vaddr = NULL;
650
651         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
652         if (page)
653                 vaddr = page_address(page);
654         return vaddr;
655 }
656
657 static inline void free_pgtable_page(void *vaddr)
658 {
659         free_page((unsigned long)vaddr);
660 }
661
662 static inline void *alloc_domain_mem(void)
663 {
664         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
665 }
666
667 static void free_domain_mem(void *vaddr)
668 {
669         kmem_cache_free(iommu_domain_cache, vaddr);
670 }
671
672 static inline void *alloc_devinfo_mem(void)
673 {
674         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
675 }
676
677 static inline void free_devinfo_mem(void *vaddr)
678 {
679         kmem_cache_free(iommu_devinfo_cache, vaddr);
680 }
681
682 static inline int domain_type_is_vm(struct dmar_domain *domain)
683 {
684         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
685 }
686
687 static inline int domain_type_is_si(struct dmar_domain *domain)
688 {
689         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
690 }
691
692 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
693 {
694         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
695                                 DOMAIN_FLAG_STATIC_IDENTITY);
696 }
697
698 static inline int domain_pfn_supported(struct dmar_domain *domain,
699                                        unsigned long pfn)
700 {
701         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
702
703         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
704 }
705
706 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
707 {
708         unsigned long sagaw;
709         int agaw = -1;
710
711         sagaw = cap_sagaw(iommu->cap);
712         for (agaw = width_to_agaw(max_gaw);
713              agaw >= 0; agaw--) {
714                 if (test_bit(agaw, &sagaw))
715                         break;
716         }
717
718         return agaw;
719 }
720
721 /*
722  * Calculate max SAGAW for each iommu.
723  */
724 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
725 {
726         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
727 }
728
729 /*
730  * Calculate agaw for each iommu.
731  * "SAGAW" may be different across iommus; use a default agaw and fall
732  * back to a smaller supported agaw for iommus that don't support the default.
733  */
734 int iommu_calculate_agaw(struct intel_iommu *iommu)
735 {
736         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
737 }
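/*
 * cap_sagaw() is a bitmap of the AGAW values the hardware supports: bit N
 * set means an (N+2)-level table covering 30 + 9*N bits is implemented.
 * __iommu_calculate_agaw() walks downwards from the requested width, so
 * e.g. a request for the default 48-bit width (agaw 2) on hardware that
 * only sets bit 1 falls back to a 39-bit, 3-level configuration.
 */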
738
739 /* This function only returns a single iommu in a domain */
740 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
741 {
742         int iommu_id;
743
744         /* si_domain and vm domain should not get here. */
745         BUG_ON(domain_type_is_vm_or_si(domain));
746         for_each_domain_iommu(iommu_id, domain)
747                 break;
748
749         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
750                 return NULL;
751
752         return g_iommus[iommu_id];
753 }
754
755 static void domain_update_iommu_coherency(struct dmar_domain *domain)
756 {
757         struct dmar_drhd_unit *drhd;
758         struct intel_iommu *iommu;
759         bool found = false;
760         int i;
761
762         domain->iommu_coherency = 1;
763
764         for_each_domain_iommu(i, domain) {
765                 found = true;
766                 if (!ecap_coherent(g_iommus[i]->ecap)) {
767                         domain->iommu_coherency = 0;
768                         break;
769                 }
770         }
771         if (found)
772                 return;
773
774         /* No hardware attached; use lowest common denominator */
775         rcu_read_lock();
776         for_each_active_iommu(iommu, drhd) {
777                 if (!ecap_coherent(iommu->ecap)) {
778                         domain->iommu_coherency = 0;
779                         break;
780                 }
781         }
782         rcu_read_unlock();
783 }
784
785 static int domain_update_iommu_snooping(struct intel_iommu *skip)
786 {
787         struct dmar_drhd_unit *drhd;
788         struct intel_iommu *iommu;
789         int ret = 1;
790
791         rcu_read_lock();
792         for_each_active_iommu(iommu, drhd) {
793                 if (iommu != skip) {
794                         if (!ecap_sc_support(iommu->ecap)) {
795                                 ret = 0;
796                                 break;
797                         }
798                 }
799         }
800         rcu_read_unlock();
801
802         return ret;
803 }
804
805 static int domain_update_iommu_superpage(struct intel_iommu *skip)
806 {
807         struct dmar_drhd_unit *drhd;
808         struct intel_iommu *iommu;
809         int mask = 0xf;
810
811         if (!intel_iommu_superpage) {
812                 return 0;
813         }
814
815         /* set iommu_superpage to the smallest common denominator */
816         rcu_read_lock();
817         for_each_active_iommu(iommu, drhd) {
818                 if (iommu != skip) {
819                         mask &= cap_super_page_val(iommu->cap);
820                         if (!mask)
821                                 break;
822                 }
823         }
824         rcu_read_unlock();
825
826         return fls(mask);
827 }
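/*
 * cap_super_page_val() is a 4-bit mask of supported superpage sizes
 * (bit 0 = 2MiB, bit 1 = 1GiB, bit 2 = 512GiB, bit 3 = 1TiB), matching the
 * iommu_superpage levels documented in struct dmar_domain above. If every
 * active iommu advertises 2MiB and 1GiB the common mask is 0x3 and fls()
 * yields 2, i.e. superpages up to 1GiB may be used.
 */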
828
829 /* Some capabilities may be different across iommus */
830 static void domain_update_iommu_cap(struct dmar_domain *domain)
831 {
832         domain_update_iommu_coherency(domain);
833         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
834         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
835 }
836
837 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
838                                                        u8 bus, u8 devfn, int alloc)
839 {
840         struct root_entry *root = &iommu->root_entry[bus];
841         struct context_entry *context;
842         u64 *entry;
843
844         entry = &root->lo;
845         if (ecs_enabled(iommu)) {
846                 if (devfn >= 0x80) {
847                         devfn -= 0x80;
848                         entry = &root->hi;
849                 }
850                 devfn *= 2;
851         }
852         if (*entry & 1)
853                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
854         else {
855                 unsigned long phy_addr;
856                 if (!alloc)
857                         return NULL;
858
859                 context = alloc_pgtable_page(iommu->node);
860                 if (!context)
861                         return NULL;
862
863                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
864                 phy_addr = virt_to_phys((void *)context);
865                 *entry = phy_addr | 1;
866                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
867         }
868         return &context[devfn];
869 }
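/*
 * In legacy mode root->lo points at one context table of 256 16-byte
 * entries, indexed directly by devfn. With extended context support (ECS)
 * each entry is twice as large, so a 4KiB table only holds 128 of them:
 * devfn 0-127 is reached through root->lo and devfn 128-255 through
 * root->hi, and devfn is doubled above so that indexing an array of
 * regular-sized context entries lands on the right 32-byte slot.
 */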
870
871 static int iommu_dummy(struct device *dev)
872 {
873         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
874 }
875
876 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
877 {
878         struct dmar_drhd_unit *drhd = NULL;
879         struct intel_iommu *iommu;
880         struct device *tmp;
881         struct pci_dev *ptmp, *pdev = NULL;
882         u16 segment = 0;
883         int i;
884
885         if (iommu_dummy(dev))
886                 return NULL;
887
888         if (dev_is_pci(dev)) {
889                 struct pci_dev *pf_pdev;
890
891                 pdev = to_pci_dev(dev);
892                 /* VFs aren't listed in scope tables; we need to look up
893                  * the PF instead to find the IOMMU. */
894                 pf_pdev = pci_physfn(pdev);
895                 dev = &pf_pdev->dev;
896                 segment = pci_domain_nr(pdev->bus);
897         } else if (has_acpi_companion(dev))
898                 dev = &ACPI_COMPANION(dev)->dev;
899
900         rcu_read_lock();
901         for_each_active_iommu(iommu, drhd) {
902                 if (pdev && segment != drhd->segment)
903                         continue;
904
905                 for_each_active_dev_scope(drhd->devices,
906                                           drhd->devices_cnt, i, tmp) {
907                         if (tmp == dev) {
908                                 /* For a VF use its original BDF# not that of the PF
909                                  * which we used for the IOMMU lookup. Strictly speaking
910                                  * we could do this for all PCI devices; we only need to
911                                  * get the BDF# from the scope table for ACPI matches. */
912                                 if (pdev && pdev->is_virtfn)
913                                         goto got_pdev;
914
915                                 *bus = drhd->devices[i].bus;
916                                 *devfn = drhd->devices[i].devfn;
917                                 goto out;
918                         }
919
920                         if (!pdev || !dev_is_pci(tmp))
921                                 continue;
922
923                         ptmp = to_pci_dev(tmp);
924                         if (ptmp->subordinate &&
925                             ptmp->subordinate->number <= pdev->bus->number &&
926                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
927                                 goto got_pdev;
928                 }
929
930                 if (pdev && drhd->include_all) {
931                 got_pdev:
932                         *bus = pdev->bus->number;
933                         *devfn = pdev->devfn;
934                         goto out;
935                 }
936         }
937         iommu = NULL;
938  out:
939         rcu_read_unlock();
940
941         return iommu;
942 }
943
944 static void domain_flush_cache(struct dmar_domain *domain,
945                                void *addr, int size)
946 {
947         if (!domain->iommu_coherency)
948                 clflush_cache_range(addr, size);
949 }
950
951 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
952 {
953         struct context_entry *context;
954         int ret = 0;
955         unsigned long flags;
956
957         spin_lock_irqsave(&iommu->lock, flags);
958         context = iommu_context_addr(iommu, bus, devfn, 0);
959         if (context)
960                 ret = context_present(context);
961         spin_unlock_irqrestore(&iommu->lock, flags);
962         return ret;
963 }
964
965 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
966 {
967         struct context_entry *context;
968         unsigned long flags;
969
970         spin_lock_irqsave(&iommu->lock, flags);
971         context = iommu_context_addr(iommu, bus, devfn, 0);
972         if (context) {
973                 context_clear_entry(context);
974                 __iommu_flush_cache(iommu, context, sizeof(*context));
975         }
976         spin_unlock_irqrestore(&iommu->lock, flags);
977 }
978
979 static void free_context_table(struct intel_iommu *iommu)
980 {
981         int i;
982         unsigned long flags;
983         struct context_entry *context;
984
985         spin_lock_irqsave(&iommu->lock, flags);
986         if (!iommu->root_entry) {
987                 goto out;
988         }
989         for (i = 0; i < ROOT_ENTRY_NR; i++) {
990                 context = iommu_context_addr(iommu, i, 0, 0);
991                 if (context)
992                         free_pgtable_page(context);
993
994                 if (!ecs_enabled(iommu))
995                         continue;
996
997                 context = iommu_context_addr(iommu, i, 0x80, 0);
998                 if (context)
999                         free_pgtable_page(context);
1000
1001         }
1002         free_pgtable_page(iommu->root_entry);
1003         iommu->root_entry = NULL;
1004 out:
1005         spin_unlock_irqrestore(&iommu->lock, flags);
1006 }
1007
1008 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1009                                       unsigned long pfn, int *target_level)
1010 {
1011         struct dma_pte *parent, *pte = NULL;
1012         int level = agaw_to_level(domain->agaw);
1013         int offset;
1014
1015         BUG_ON(!domain->pgd);
1016
1017         if (!domain_pfn_supported(domain, pfn))
1018                 /* Address beyond IOMMU's addressing capabilities. */
1019                 return NULL;
1020
1021         parent = domain->pgd;
1022
1023         while (1) {
1024                 void *tmp_page;
1025
1026                 offset = pfn_level_offset(pfn, level);
1027                 pte = &parent[offset];
1028                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1029                         break;
1030                 if (level == *target_level)
1031                         break;
1032
1033                 if (!dma_pte_present(pte)) {
1034                         uint64_t pteval;
1035
1036                         tmp_page = alloc_pgtable_page(domain->nid);
1037
1038                         if (!tmp_page)
1039                                 return NULL;
1040
1041                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1042                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1043                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1044                                 /* Someone else set it while we were thinking; use theirs. */
1045                                 free_pgtable_page(tmp_page);
1046                         else
1047                                 domain_flush_cache(domain, pte, sizeof(*pte));
1048                 }
1049                 if (level == 1)
1050                         break;
1051
1052                 parent = phys_to_virt(dma_pte_addr(pte));
1053                 level--;
1054         }
1055
1056         if (!*target_level)
1057                 *target_level = level;
1058
1059         return pte;
1060 }
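/*
 * pfn_to_dma_pte() is both a lookup and a table builder. With
 * *target_level passed as 0 it only walks existing entries, stopping at a
 * superpage or a non-present slot, and reports the level it reached back
 * through the pointer. With a non-zero *target_level it allocates missing
 * intermediate tables on the way down, using cmpxchg64() so that a table
 * installed by a racing thread is reused and the local page freed, until
 * it reaches the requested level (1 for 4KiB mappings, 2 for 2MiB
 * superpages, and so on).
 */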
1061
1062
1063 /* return address's pte at specific level */
1064 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1065                                          unsigned long pfn,
1066                                          int level, int *large_page)
1067 {
1068         struct dma_pte *parent, *pte = NULL;
1069         int total = agaw_to_level(domain->agaw);
1070         int offset;
1071
1072         parent = domain->pgd;
1073         while (level <= total) {
1074                 offset = pfn_level_offset(pfn, total);
1075                 pte = &parent[offset];
1076                 if (level == total)
1077                         return pte;
1078
1079                 if (!dma_pte_present(pte)) {
1080                         *large_page = total;
1081                         break;
1082                 }
1083
1084                 if (dma_pte_superpage(pte)) {
1085                         *large_page = total;
1086                         return pte;
1087                 }
1088
1089                 parent = phys_to_virt(dma_pte_addr(pte));
1090                 total--;
1091         }
1092         return NULL;
1093 }
1094
1095 /* clear last level pte; a tlb flush should follow */
1096 static void dma_pte_clear_range(struct dmar_domain *domain,
1097                                 unsigned long start_pfn,
1098                                 unsigned long last_pfn)
1099 {
1100         unsigned int large_page = 1;
1101         struct dma_pte *first_pte, *pte;
1102
1103         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1104         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1105         BUG_ON(start_pfn > last_pfn);
1106
1107         /* we don't need lock here; nobody else touches the iova range */
1108         do {
1109                 large_page = 1;
1110                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1111                 if (!pte) {
1112                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1113                         continue;
1114                 }
1115                 do {
1116                         dma_clear_pte(pte);
1117                         start_pfn += lvl_to_nr_pages(large_page);
1118                         pte++;
1119                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1120
1121                 domain_flush_cache(domain, first_pte,
1122                                    (void *)pte - (void *)first_pte);
1123
1124         } while (start_pfn && start_pfn <= last_pfn);
1125 }
1126
1127 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1128                                struct dma_pte *pte, unsigned long pfn,
1129                                unsigned long start_pfn, unsigned long last_pfn)
1130 {
1131         pfn = max(start_pfn, pfn);
1132         pte = &pte[pfn_level_offset(pfn, level)];
1133
1134         do {
1135                 unsigned long level_pfn;
1136                 struct dma_pte *level_pte;
1137
1138                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1139                         goto next;
1140
1141                 level_pfn = pfn & level_mask(level);
1142                 level_pte = phys_to_virt(dma_pte_addr(pte));
1143
1144                 if (level > 2)
1145                         dma_pte_free_level(domain, level - 1, level_pte,
1146                                            level_pfn, start_pfn, last_pfn);
1147
1148                 /* If range covers entire pagetable, free it */
1149                 if (!(start_pfn > level_pfn ||
1150                       last_pfn < level_pfn + level_size(level) - 1)) {
1151                         dma_clear_pte(pte);
1152                         domain_flush_cache(domain, pte, sizeof(*pte));
1153                         free_pgtable_page(level_pte);
1154                 }
1155 next:
1156                 pfn += level_size(level);
1157         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1158 }
1159
1160 /* free page table pages. last level pte should already be cleared */
1161 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1162                                    unsigned long start_pfn,
1163                                    unsigned long last_pfn)
1164 {
1165         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1166         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1167         BUG_ON(start_pfn > last_pfn);
1168
1169         dma_pte_clear_range(domain, start_pfn, last_pfn);
1170
1171         /* We don't need lock here; nobody else touches the iova range */
1172         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1173                            domain->pgd, 0, start_pfn, last_pfn);
1174
1175         /* free pgd */
1176         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1177                 free_pgtable_page(domain->pgd);
1178                 domain->pgd = NULL;
1179         }
1180 }
1181
1182 /* When a page at a given level is being unlinked from its parent, we don't
1183    need to *modify* it at all. All we need to do is make a list of all the
1184    pages which can be freed just as soon as we've flushed the IOTLB and we
1185    know the hardware page-walk will no longer touch them.
1186    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1187    be freed. */
1188 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1189                                             int level, struct dma_pte *pte,
1190                                             struct page *freelist)
1191 {
1192         struct page *pg;
1193
1194         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1195         pg->freelist = freelist;
1196         freelist = pg;
1197
1198         if (level == 1)
1199                 return freelist;
1200
1201         pte = page_address(pg);
1202         do {
1203                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1204                         freelist = dma_pte_list_pagetables(domain, level - 1,
1205                                                            pte, freelist);
1206                 pte++;
1207         } while (!first_pte_in_page(pte));
1208
1209         return freelist;
1210 }
1211
1212 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1213                                         struct dma_pte *pte, unsigned long pfn,
1214                                         unsigned long start_pfn,
1215                                         unsigned long last_pfn,
1216                                         struct page *freelist)
1217 {
1218         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1219
1220         pfn = max(start_pfn, pfn);
1221         pte = &pte[pfn_level_offset(pfn, level)];
1222
1223         do {
1224                 unsigned long level_pfn;
1225
1226                 if (!dma_pte_present(pte))
1227                         goto next;
1228
1229                 level_pfn = pfn & level_mask(level);
1230
1231                 /* If range covers entire pagetable, free it */
1232                 if (start_pfn <= level_pfn &&
1233                     last_pfn >= level_pfn + level_size(level) - 1) {
1234                         /* These subordinate page tables are going away entirely. Don't
1235                            bother to clear them; we're just going to *free* them. */
1236                         if (level > 1 && !dma_pte_superpage(pte))
1237                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1238
1239                         dma_clear_pte(pte);
1240                         if (!first_pte)
1241                                 first_pte = pte;
1242                         last_pte = pte;
1243                 } else if (level > 1) {
1244                         /* Recurse down into a level that isn't *entirely* obsolete */
1245                         freelist = dma_pte_clear_level(domain, level - 1,
1246                                                        phys_to_virt(dma_pte_addr(pte)),
1247                                                        level_pfn, start_pfn, last_pfn,
1248                                                        freelist);
1249                 }
1250 next:
1251                 pfn += level_size(level);
1252         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1253
1254         if (first_pte)
1255                 domain_flush_cache(domain, first_pte,
1256                                    (void *)++last_pte - (void *)first_pte);
1257
1258         return freelist;
1259 }
1260
1261 /* We can't just free the pages because the IOMMU may still be walking
1262    the page tables, and may have cached the intermediate levels. The
1263    pages can only be freed after the IOTLB flush has been done. */
1264 static struct page *domain_unmap(struct dmar_domain *domain,
1265                                  unsigned long start_pfn,
1266                                  unsigned long last_pfn)
1267 {
1268         struct page *freelist = NULL;
1269
1270         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1271         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1272         BUG_ON(start_pfn > last_pfn);
1273
1274         /* we don't need lock here; nobody else touches the iova range */
1275         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1276                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1277
1278         /* free pgd */
1279         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1280                 struct page *pgd_page = virt_to_page(domain->pgd);
1281                 pgd_page->freelist = freelist;
1282                 freelist = pgd_page;
1283
1284                 domain->pgd = NULL;
1285         }
1286
1287         return freelist;
1288 }
1289
1290 static void dma_free_pagelist(struct page *freelist)
1291 {
1292         struct page *pg;
1293
1294         while ((pg = freelist)) {
1295                 freelist = pg->freelist;
1296                 free_pgtable_page(page_address(pg));
1297         }
1298 }
1299
1300 /* iommu handling */
1301 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1302 {
1303         struct root_entry *root;
1304         unsigned long flags;
1305
1306         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1307         if (!root) {
1308                 pr_err("Allocating root entry for %s failed\n",
1309                         iommu->name);
1310                 return -ENOMEM;
1311         }
1312
1313         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1314
1315         spin_lock_irqsave(&iommu->lock, flags);
1316         iommu->root_entry = root;
1317         spin_unlock_irqrestore(&iommu->lock, flags);
1318
1319         return 0;
1320 }
1321
1322 static void iommu_set_root_entry(struct intel_iommu *iommu)
1323 {
1324         u64 addr;
1325         u32 sts;
1326         unsigned long flag;
1327
1328         addr = virt_to_phys(iommu->root_entry);
1329         if (ecs_enabled(iommu))
1330                 addr |= DMA_RTADDR_RTT;
1331
1332         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1333         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1334
1335         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1336
1337         /* Make sure hardware completes it */
1338         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339                       readl, (sts & DMA_GSTS_RTPS), sts);
1340
1341         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1342 }
1343
1344 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1345 {
1346         u32 val;
1347         unsigned long flag;
1348
1349         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1350                 return;
1351
1352         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1354
1355         /* Make sure hardware completes it */
1356         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1357                       readl, (!(val & DMA_GSTS_WBFS)), val);
1358
1359         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1360 }
1361
1362 /* return value determines if we need a write buffer flush */
1363 static void __iommu_flush_context(struct intel_iommu *iommu,
1364                                   u16 did, u16 source_id, u8 function_mask,
1365                                   u64 type)
1366 {
1367         u64 val = 0;
1368         unsigned long flag;
1369
1370         switch (type) {
1371         case DMA_CCMD_GLOBAL_INVL:
1372                 val = DMA_CCMD_GLOBAL_INVL;
1373                 break;
1374         case DMA_CCMD_DOMAIN_INVL:
1375                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1376                 break;
1377         case DMA_CCMD_DEVICE_INVL:
1378                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1379                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1380                 break;
1381         default:
1382                 BUG();
1383         }
1384         val |= DMA_CCMD_ICC;
1385
1386         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1387         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1388
1389         /* Make sure hardware completes it */
1390         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1391                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1392
1393         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1394 }
1395
1396 /* return value determines if we need a write buffer flush */
1397 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1398                                 u64 addr, unsigned int size_order, u64 type)
1399 {
1400         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1401         u64 val = 0, val_iva = 0;
1402         unsigned long flag;
1403
1404         switch (type) {
1405         case DMA_TLB_GLOBAL_FLUSH:
1406                 /* global flush doesn't need to set IVA_REG */
1407                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1408                 break;
1409         case DMA_TLB_DSI_FLUSH:
1410                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411                 break;
1412         case DMA_TLB_PSI_FLUSH:
1413                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1414                 /* IH bit is passed in as part of address */
1415                 val_iva = size_order | addr;
1416                 break;
1417         default:
1418                 BUG();
1419         }
1420         /* Note: set drain read/write */
1421 #if 0
1422         /*
1423          * This is probably meant to be extra safe; it looks like we can
1424          * ignore it without any impact.
1425          */
1426         if (cap_read_drain(iommu->cap))
1427                 val |= DMA_TLB_READ_DRAIN;
1428 #endif
1429         if (cap_write_drain(iommu->cap))
1430                 val |= DMA_TLB_WRITE_DRAIN;
1431
1432         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1433         /* Note: Only uses first TLB reg currently */
1434         if (val_iva)
1435                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1436         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1437
1438         /* Make sure hardware completes it */
1439         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1440                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1441
1442         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1443
1444         /* check IOTLB invalidation granularity */
1445         if (DMA_TLB_IAIG(val) == 0)
1446                 pr_err("Flush IOTLB failed\n");
1447         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1448                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1449                         (unsigned long long)DMA_TLB_IIRG(type),
1450                         (unsigned long long)DMA_TLB_IAIG(val));
1451 }
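/*
 * The IOTLB registers live at an implementation-specific offset given by
 * ecap_iotlb_offset(): the IVA register sits at that offset and the IOTLB
 * command register 8 bytes after it. For page-selective flushes the
 * address written to IVA is 4KiB aligned, so size_order occupies the low
 * bits freed by that alignment (2^size_order pages are invalidated) and
 * the caller merges the invalidation hint into bit 6 of the address.
 */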
1452
1453 static struct device_domain_info *
1454 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1455                          u8 bus, u8 devfn)
1456 {
1457         struct device_domain_info *info;
1458
1459         assert_spin_locked(&device_domain_lock);
1460
1461         if (!iommu->qi)
1462                 return NULL;
1463
1464         list_for_each_entry(info, &domain->devices, link)
1465                 if (info->iommu == iommu && info->bus == bus &&
1466                     info->devfn == devfn) {
1467                         if (info->ats_supported && info->dev)
1468                                 return info;
1469                         break;
1470                 }
1471
1472         return NULL;
1473 }
1474
1475 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1476 {
1477         struct pci_dev *pdev;
1478
1479         if (!info || !dev_is_pci(info->dev))
1480                 return;
1481
1482         pdev = to_pci_dev(info->dev);
1483         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1484          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1485          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1486          * reserved, which should be set to 0.
1487          */
1488         if (!ecap_dit(info->iommu->ecap))
1489                 info->pfsid = 0;
1490         else {
1491                 struct pci_dev *pf_pdev;
1492
1493                 /* pdev will be returned if device is not a vf */
1494                 pf_pdev = pci_physfn(pdev);
1495                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1496         }
1497
1498 #ifdef CONFIG_INTEL_IOMMU_SVM
1499         /* The PCIe spec, in its wisdom, declares that the behaviour of
1500            the device if you enable PASID support after ATS support is
1501            undefined. So always enable PASID support on devices which
1502            have it, even if we can't yet know if we're ever going to
1503            use it. */
1504         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1505                 info->pasid_enabled = 1;
1506
1507         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1508                 info->pri_enabled = 1;
1509 #endif
1510         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1511                 info->ats_enabled = 1;
1512                 info->ats_qdep = pci_ats_queue_depth(pdev);
1513         }
1514 }
1515
1516 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1517 {
1518         struct pci_dev *pdev;
1519
1520         if (!dev_is_pci(info->dev))
1521                 return;
1522
1523         pdev = to_pci_dev(info->dev);
1524
1525         if (info->ats_enabled) {
1526                 pci_disable_ats(pdev);
1527                 info->ats_enabled = 0;
1528         }
1529 #ifdef CONFIG_INTEL_IOMMU_SVM
1530         if (info->pri_enabled) {
1531                 pci_disable_pri(pdev);
1532                 info->pri_enabled = 0;
1533         }
1534         if (info->pasid_enabled) {
1535                 pci_disable_pasid(pdev);
1536                 info->pasid_enabled = 0;
1537         }
1538 #endif
1539 }
1540
1541 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1542                                   u64 addr, unsigned mask)
1543 {
1544         u16 sid, qdep;
1545         unsigned long flags;
1546         struct device_domain_info *info;
1547
1548         spin_lock_irqsave(&device_domain_lock, flags);
1549         list_for_each_entry(info, &domain->devices, link) {
1550                 if (!info->ats_enabled)
1551                         continue;
1552
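                /*
                 * The source-id packs bus:devfn into 16 bits; for example
                 * (illustrative) device 02:00.1 gives
                 * sid = (0x02 << 8) | 0x01 = 0x0201.
                 */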
1553                 sid = info->bus << 8 | info->devfn;
1554                 qdep = info->ats_qdep;
1555                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1556                                 qdep, addr, mask);
1557         }
1558         spin_unlock_irqrestore(&device_domain_lock, flags);
1559 }
1560
1561 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1562                                   struct dmar_domain *domain,
1563                                   unsigned long pfn, unsigned int pages,
1564                                   int ih, int map)
1565 {
1566         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1567         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1568         u16 did = domain->iommu_did[iommu->seq_id];
1569
1570         BUG_ON(pages == 0);
1571
1572         if (ih)
1573                 ih = 1 << 6;
1574         /*
1575          * Fall back to a domain-selective flush if there is no PSI support
1576          * or if the size is too big.
1577          * PSI requires the page count to be a power of two, and the base
1578          * address to be naturally aligned to that size.
1579          */
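        /*
         * Worked example (illustrative): pages == 5 gives
         * mask = ilog2(__roundup_pow_of_two(5)) = 3, i.e. a PSI covering
         * 2^3 = 8 VT-d pages starting at an 8-page-aligned address.
         */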
1580         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1581                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1582                                                 DMA_TLB_DSI_FLUSH);
1583         else
1584                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1585                                                 DMA_TLB_PSI_FLUSH);
1586
1587         /*
1588          * In caching mode, changes of pages from non-present to present
1589          * require a flush. However, the device IOTLB doesn't need flushing here.
1590          */
1591         if (!cap_caching_mode(iommu->cap) || !map)
1592                 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1593                                       addr, mask);
1594 }
1595
1596 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597 {
1598         u32 pmen;
1599         unsigned long flags;
1600
1601         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602                 return;
1603
1604         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606         pmen &= ~DMA_PMEN_EPM;
1607         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609         /* wait for the protected region status bit to clear */
1610         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614 }
1615
1616 static void iommu_enable_translation(struct intel_iommu *iommu)
1617 {
1618         u32 sts;
1619         unsigned long flags;
1620
1621         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622         iommu->gcmd |= DMA_GCMD_TE;
1623         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625         /* Make sure the hardware completes it */
1626         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627                       readl, (sts & DMA_GSTS_TES), sts);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_disable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flag;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1638         iommu->gcmd &= ~DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure the hardware completes it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (!(sts & DMA_GSTS_TES)), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1646 }
1647
1648
1649 static int iommu_init_domains(struct intel_iommu *iommu)
1650 {
1651         u32 ndomains, nlongs;
1652         size_t size;
1653
1654         ndomains = cap_ndoms(iommu->cap);
1655         pr_debug("%s: Number of Domains supported <%d>\n",
1656                  iommu->name, ndomains);
1657         nlongs = BITS_TO_LONGS(ndomains);
1658
1659         spin_lock_init(&iommu->lock);
1660
1661         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1662         if (!iommu->domain_ids) {
1663                 pr_err("%s: Allocating domain id array failed\n",
1664                        iommu->name);
1665                 return -ENOMEM;
1666         }
1667
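        /*
         * iommu->domains is a two-level table: ((ndomains >> 8) + 1) slots,
         * each pointing to a page of 256 struct dmar_domain pointers. Only
         * slot 0 is populated up front; for example (illustrative), 65536
         * supported domain-ids give a 257-entry first level.
         */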
1668         size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
1669         iommu->domains = kzalloc(size, GFP_KERNEL);
1670
1671         if (iommu->domains) {
1672                 size = 256 * sizeof(struct dmar_domain *);
1673                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1674         }
1675
1676         if (!iommu->domains || !iommu->domains[0]) {
1677                 pr_err("%s: Allocating domain array failed\n",
1678                        iommu->name);
1679                 kfree(iommu->domain_ids);
1680                 kfree(iommu->domains);
1681                 iommu->domain_ids = NULL;
1682                 iommu->domains    = NULL;
1683                 return -ENOMEM;
1684         }
1685
1686
1687
1688         /*
1689          * If Caching mode is set, then invalid translations are tagged
1690          * with domain-id 0, hence we need to pre-allocate it. We also
1691          * use domain-id 0 as a marker for non-allocated domain-id, so
1692          * make sure it is not used for a real domain.
1693          */
1694         set_bit(0, iommu->domain_ids);
1695
1696         return 0;
1697 }
1698
1699 static void disable_dmar_iommu(struct intel_iommu *iommu)
1700 {
1701         struct device_domain_info *info, *tmp;
1702         unsigned long flags;
1703
1704         if (!iommu->domains || !iommu->domain_ids)
1705                 return;
1706
1707 again:
1708         spin_lock_irqsave(&device_domain_lock, flags);
1709         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1710                 struct dmar_domain *domain;
1711
1712                 if (info->iommu != iommu)
1713                         continue;
1714
1715                 if (!info->dev || !info->domain)
1716                         continue;
1717
1718                 domain = info->domain;
1719
1720                 __dmar_remove_one_dev_info(info);
1721
1722                 if (!domain_type_is_vm_or_si(domain)) {
1723                         /*
1724                          * The domain_exit() function can't be called under
1725                          * device_domain_lock, as it takes this lock itself.
1726                          * So release the lock here and re-run the loop
1727                          * afterwards.
1728                          */
1729                         spin_unlock_irqrestore(&device_domain_lock, flags);
1730                         domain_exit(domain);
1731                         goto again;
1732                 }
1733         }
1734         spin_unlock_irqrestore(&device_domain_lock, flags);
1735
1736         if (iommu->gcmd & DMA_GCMD_TE)
1737                 iommu_disable_translation(iommu);
1738 }
1739
1740 static void free_dmar_iommu(struct intel_iommu *iommu)
1741 {
1742         if ((iommu->domains) && (iommu->domain_ids)) {
1743                 int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
1744                 int i;
1745
1746                 for (i = 0; i < elems; i++)
1747                         kfree(iommu->domains[i]);
1748                 kfree(iommu->domains);
1749                 kfree(iommu->domain_ids);
1750                 iommu->domains = NULL;
1751                 iommu->domain_ids = NULL;
1752         }
1753
1754         g_iommus[iommu->seq_id] = NULL;
1755
1756         /* free context mapping */
1757         free_context_table(iommu);
1758
1759 #ifdef CONFIG_INTEL_IOMMU_SVM
1760         if (pasid_enabled(iommu)) {
1761                 if (ecap_prs(iommu->ecap))
1762                         intel_svm_finish_prq(iommu);
1763                 intel_svm_free_pasid_tables(iommu);
1764         }
1765 #endif
1766 }
1767
1768 static struct dmar_domain *alloc_domain(int flags)
1769 {
1770         struct dmar_domain *domain;
1771
1772         domain = alloc_domain_mem();
1773         if (!domain)
1774                 return NULL;
1775
1776         memset(domain, 0, sizeof(*domain));
1777         domain->nid = -1;
1778         domain->flags = flags;
1779         INIT_LIST_HEAD(&domain->devices);
1780
1781         return domain;
1782 }
1783
1784 /* Must be called with device_domain_lock and iommu->lock held */
1785 static int domain_attach_iommu(struct dmar_domain *domain,
1786                                struct intel_iommu *iommu)
1787 {
1788         unsigned long ndomains;
1789         int num;
1790
1791         assert_spin_locked(&device_domain_lock);
1792         assert_spin_locked(&iommu->lock);
1793
1794         domain->iommu_refcnt[iommu->seq_id] += 1;
1795         domain->iommu_count += 1;
1796         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1797                 ndomains = cap_ndoms(iommu->cap);
1798                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1799
1800                 if (num >= ndomains) {
1801                         pr_err("%s: No free domain ids\n", iommu->name);
1802                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1803                         domain->iommu_count -= 1;
1804                         return -ENOSPC;
1805                 }
1806
1807                 set_bit(num, iommu->domain_ids);
1808                 set_iommu_domain(iommu, num, domain);
1809
1810                 domain->iommu_did[iommu->seq_id] = num;
1811                 domain->nid                      = iommu->node;
1812
1813                 domain_update_iommu_cap(domain);
1814         }
1815
1816         return 0;
1817 }
1818
1819 static int domain_detach_iommu(struct dmar_domain *domain,
1820                                struct intel_iommu *iommu)
1821 {
1822         int num, count = INT_MAX;
1823
1824         assert_spin_locked(&device_domain_lock);
1825         assert_spin_locked(&iommu->lock);
1826
1827         domain->iommu_refcnt[iommu->seq_id] -= 1;
1828         count = --domain->iommu_count;
1829         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1830                 num = domain->iommu_did[iommu->seq_id];
1831                 clear_bit(num, iommu->domain_ids);
1832                 set_iommu_domain(iommu, num, NULL);
1833
1834                 domain_update_iommu_cap(domain);
1835                 domain->iommu_did[iommu->seq_id] = 0;
1836         }
1837
1838         return count;
1839 }
1840
1841 static struct iova_domain reserved_iova_list;
1842 static struct lock_class_key reserved_rbtree_key;
1843
1844 static int dmar_init_reserved_ranges(void)
1845 {
1846         struct pci_dev *pdev = NULL;
1847         struct iova *iova;
1848         int i;
1849
1850         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1851                         DMA_32BIT_PFN);
1852
1853         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1854                 &reserved_rbtree_key);
1855
1856         /* IOAPIC ranges shouldn't be accessed by DMA */
1857         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1858                 IOVA_PFN(IOAPIC_RANGE_END));
1859         if (!iova) {
1860                 pr_err("Reserve IOAPIC range failed\n");
1861                 return -ENODEV;
1862         }
1863
1864         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1865         for_each_pci_dev(pdev) {
1866                 struct resource *r;
1867
1868                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1869                         r = &pdev->resource[i];
1870                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1871                                 continue;
1872                         iova = reserve_iova(&reserved_iova_list,
1873                                             IOVA_PFN(r->start),
1874                                             IOVA_PFN(r->end));
1875                         if (!iova) {
1876                                 pr_err("Reserve iova failed\n");
1877                                 return -ENODEV;
1878                         }
1879                 }
1880         }
1881         return 0;
1882 }
1883
1884 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1885 {
1886         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1887 }
1888
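/*
 * Round a guest address width up to the nearest width that a whole number
 * of 9-bit page-table levels above the 12-bit page offset can cover.
 * Illustrative examples: gaw == 48 is already aligned and stays 48, while
 * gaw == 40 rounds up to 40 + 9 - 1 = 48.
 */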
1889 static inline int guestwidth_to_adjustwidth(int gaw)
1890 {
1891         int agaw;
1892         int r = (gaw - 12) % 9;
1893
1894         if (r == 0)
1895                 agaw = gaw;
1896         else
1897                 agaw = gaw + 9 - r;
1898         if (agaw > 64)
1899                 agaw = 64;
1900         return agaw;
1901 }
1902
1903 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1904                        int guest_width)
1905 {
1906         int adjust_width, agaw;
1907         unsigned long sagaw;
1908
1909         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1910                         DMA_32BIT_PFN);
1911         domain_reserve_special_ranges(domain);
1912
1913         /* calculate AGAW */
1914         if (guest_width > cap_mgaw(iommu->cap))
1915                 guest_width = cap_mgaw(iommu->cap);
1916         domain->gaw = guest_width;
1917         adjust_width = guestwidth_to_adjustwidth(guest_width);
1918         agaw = width_to_agaw(adjust_width);
1919         sagaw = cap_sagaw(iommu->cap);
1920         if (!test_bit(agaw, &sagaw)) {
1921                 /* hardware doesn't support it, choose a bigger one */
1922                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1923                 agaw = find_next_bit(&sagaw, 5, agaw);
1924                 if (agaw >= 5)
1925                         return -ENODEV;
1926         }
1927         domain->agaw = agaw;
1928
1929         if (ecap_coherent(iommu->ecap))
1930                 domain->iommu_coherency = 1;
1931         else
1932                 domain->iommu_coherency = 0;
1933
1934         if (ecap_sc_support(iommu->ecap))
1935                 domain->iommu_snooping = 1;
1936         else
1937                 domain->iommu_snooping = 0;
1938
1939         if (intel_iommu_superpage)
1940                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1941         else
1942                 domain->iommu_superpage = 0;
1943
1944         domain->nid = iommu->node;
1945
1946         /* always allocate the top pgd */
1947         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1948         if (!domain->pgd)
1949                 return -ENOMEM;
1950         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1951         return 0;
1952 }
1953
1954 static void domain_exit(struct dmar_domain *domain)
1955 {
1956         struct page *freelist = NULL;
1957
1958         /* Domain 0 is reserved, so don't process it */
1959         if (!domain)
1960                 return;
1961
1962         /* Flush any lazy unmaps that may reference this domain */
1963         if (!intel_iommu_strict)
1964                 flush_unmaps_timeout(0);
1965
1966         /* Remove associated devices and clear attached or cached domains */
1967         rcu_read_lock();
1968         domain_remove_dev_info(domain);
1969         rcu_read_unlock();
1970
1971         /* destroy iovas */
1972         put_iova_domain(&domain->iovad);
1973
1974         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1975
1976         dma_free_pagelist(freelist);
1977
1978         free_domain_mem(domain);
1979 }
1980
1981 static int domain_context_mapping_one(struct dmar_domain *domain,
1982                                       struct intel_iommu *iommu,
1983                                       u8 bus, u8 devfn)
1984 {
1985         u16 did = domain->iommu_did[iommu->seq_id];
1986         int translation = CONTEXT_TT_MULTI_LEVEL;
1987         struct device_domain_info *info = NULL;
1988         struct context_entry *context;
1989         unsigned long flags;
1990         struct dma_pte *pgd;
1991         int ret, agaw;
1992
1993         WARN_ON(did == 0);
1994
1995         if (hw_pass_through && domain_type_is_si(domain))
1996                 translation = CONTEXT_TT_PASS_THROUGH;
1997
1998         pr_debug("Set context mapping for %02x:%02x.%d\n",
1999                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2000
2001         BUG_ON(!domain->pgd);
2002
2003         spin_lock_irqsave(&device_domain_lock, flags);
2004         spin_lock(&iommu->lock);
2005
2006         ret = -ENOMEM;
2007         context = iommu_context_addr(iommu, bus, devfn, 1);
2008         if (!context)
2009                 goto out_unlock;
2010
2011         ret = 0;
2012         if (context_present(context))
2013                 goto out_unlock;
2014
2015         /*
2016          * For kdump cases, old valid entries may be cached due to in-flight
2017          * DMA and the copied page tables, but there is no unmapping behaviour
2018          * for them, so we need an explicit cache flush for the newly-mapped
2019          * device. In the kdump kernel, by this point the device is expected
2020          * to have finished its reset during driver probe, so no in-flight
2021          * DMA will exist and nothing further needs to be done for it
2022          * hereafter.
2023          */
2024         if (context_copied(context)) {
2025                 u16 did_old = context_domain_id(context);
2026
2027                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap)) {
2028                         iommu->flush.flush_context(iommu, did_old,
2029                                                    (((u16)bus) << 8) | devfn,
2030                                                    DMA_CCMD_MASK_NOBIT,
2031                                                    DMA_CCMD_DEVICE_INVL);
2032                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2033                                                  DMA_TLB_DSI_FLUSH);
2034                 }
2035         }
2036
2037         pgd = domain->pgd;
2038
2039         context_clear_entry(context);
2040         context_set_domain_id(context, did);
2041
2042         /*
2043          * Skip the top levels of the page tables for an IOMMU whose AGAW is
2044          * less than the domain's. Unnecessary for PT mode.
2045          */
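        /*
         * Illustrative example: a 48-bit domain (agaw 2) attached to an
         * IOMMU whose hardware AGAW only covers 39 bits (agaw 1) walks
         * down one page-table level here and programs the context entry
         * with that lower-level table as the address root.
         */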
2046         if (translation != CONTEXT_TT_PASS_THROUGH) {
2047                 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2048                         ret = -ENOMEM;
2049                         pgd = phys_to_virt(dma_pte_addr(pgd));
2050                         if (!dma_pte_present(pgd))
2051                                 goto out_unlock;
2052                 }
2053
2054                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055                 if (info && info->ats_supported)
2056                         translation = CONTEXT_TT_DEV_IOTLB;
2057                 else
2058                         translation = CONTEXT_TT_MULTI_LEVEL;
2059
2060                 context_set_address_root(context, virt_to_phys(pgd));
2061                 context_set_address_width(context, agaw);
2062         } else {
2063                 /*
2064                  * In pass through mode, AW must be programmed to
2065                  * indicate the largest AGAW value supported by
2066                  * hardware. And ASR is ignored by hardware.
2067                  */
2068                 context_set_address_width(context, iommu->msagaw);
2069         }
2070
2071         context_set_translation_type(context, translation);
2072         context_set_fault_enable(context);
2073         context_set_present(context);
2074         domain_flush_cache(domain, context, sizeof(*context));
2075
2076         /*
2077          * It's a non-present to present mapping. If hardware doesn't cache
2078          * non-present entries, we only need to flush the write-buffer. If it
2079          * _does_ cache non-present entries, then it does so in the special
2080          * domain #0, which we have to flush:
2081          */
2082         if (cap_caching_mode(iommu->cap)) {
2083                 iommu->flush.flush_context(iommu, 0,
2084                                            (((u16)bus) << 8) | devfn,
2085                                            DMA_CCMD_MASK_NOBIT,
2086                                            DMA_CCMD_DEVICE_INVL);
2087                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2088         } else {
2089                 iommu_flush_write_buffer(iommu);
2090         }
2091         iommu_enable_dev_iotlb(info);
2092
2093         ret = 0;
2094
2095 out_unlock:
2096         spin_unlock(&iommu->lock);
2097         spin_unlock_irqrestore(&device_domain_lock, flags);
2098
2099         return ret;
2100 }
2101
2102 struct domain_context_mapping_data {
2103         struct dmar_domain *domain;
2104         struct intel_iommu *iommu;
2105 };
2106
2107 static int domain_context_mapping_cb(struct pci_dev *pdev,
2108                                      u16 alias, void *opaque)
2109 {
2110         struct domain_context_mapping_data *data = opaque;
2111
2112         return domain_context_mapping_one(data->domain, data->iommu,
2113                                           PCI_BUS_NUM(alias), alias & 0xff);
2114 }
2115
2116 static int
2117 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2118 {
2119         struct intel_iommu *iommu;
2120         u8 bus, devfn;
2121         struct domain_context_mapping_data data;
2122
2123         iommu = device_to_iommu(dev, &bus, &devfn);
2124         if (!iommu)
2125                 return -ENODEV;
2126
2127         if (!dev_is_pci(dev))
2128                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2129
2130         data.domain = domain;
2131         data.iommu = iommu;
2132
2133         return pci_for_each_dma_alias(to_pci_dev(dev),
2134                                       &domain_context_mapping_cb, &data);
2135 }
2136
2137 static int domain_context_mapped_cb(struct pci_dev *pdev,
2138                                     u16 alias, void *opaque)
2139 {
2140         struct intel_iommu *iommu = opaque;
2141
2142         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2143 }
2144
2145 static int domain_context_mapped(struct device *dev)
2146 {
2147         struct intel_iommu *iommu;
2148         u8 bus, devfn;
2149
2150         iommu = device_to_iommu(dev, &bus, &devfn);
2151         if (!iommu)
2152                 return -ENODEV;
2153
2154         if (!dev_is_pci(dev))
2155                 return device_context_mapped(iommu, bus, devfn);
2156
2157         return !pci_for_each_dma_alias(to_pci_dev(dev),
2158                                        domain_context_mapped_cb, iommu);
2159 }
2160
2161 /* Return the number of VT-d pages needed, rounded up to whole MM pages */
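/*
 * Illustrative example, assuming 4KiB pages: an offset of 0x800 into a
 * page plus a size of 0x1000 spans two pages, so this returns 2 even
 * though the size alone fits in one page.
 */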
2162 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2163                                             size_t size)
2164 {
2165         host_addr &= ~PAGE_MASK;
2166         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2167 }
2168
2169 /* Return largest possible superpage level for a given mapping */
2170 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2171                                           unsigned long iov_pfn,
2172                                           unsigned long phy_pfn,
2173                                           unsigned long pages)
2174 {
2175         int support, level = 1;
2176         unsigned long pfnmerge;
2177
2178         support = domain->iommu_superpage;
2179
2180         /* To use a large page, the virtual *and* physical addresses
2181            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2182            of them will mean we have to use smaller pages. So just
2183            merge them and check both at once. */
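        /* For example (illustrative): with both PFNs 2MiB-aligned (low
           9 bits clear) and at least 512 pages left to map, the loop
           below settles on level 2, i.e. a 2MiB superpage. */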
2184         pfnmerge = iov_pfn | phy_pfn;
2185
2186         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2187                 pages >>= VTD_STRIDE_SHIFT;
2188                 if (!pages)
2189                         break;
2190                 pfnmerge >>= VTD_STRIDE_SHIFT;
2191                 level++;
2192                 support--;
2193         }
2194         return level;
2195 }
2196
2197 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2198                             struct scatterlist *sg, unsigned long phys_pfn,
2199                             unsigned long nr_pages, int prot)
2200 {
2201         struct dma_pte *first_pte = NULL, *pte = NULL;
2202         phys_addr_t uninitialized_var(pteval);
2203         unsigned long sg_res = 0;
2204         unsigned int largepage_lvl = 0;
2205         unsigned long lvl_pages = 0;
2206
2207         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2208
2209         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2210                 return -EINVAL;
2211
2212         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2213
2214         if (!sg) {
2215                 sg_res = nr_pages;
2216                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2217         }
2218
2219         while (nr_pages > 0) {
2220                 uint64_t tmp;
2221
2222                 if (!sg_res) {
2223                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2224
2225                         sg_res = aligned_nrpages(sg->offset, sg->length);
2226                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2227                         sg->dma_length = sg->length;
2228                         pteval = (sg_phys(sg) - pgoff) | prot;
2229                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2230                 }
2231
2232                 if (!pte) {
2233                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2234
2235                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2236                         if (!pte)
2237                                 return -ENOMEM;
2238                         /* It is a large page */
2239                         if (largepage_lvl > 1) {
2240                                 unsigned long nr_superpages, end_pfn;
2241
2242                                 pteval |= DMA_PTE_LARGE_PAGE;
2243                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2244
2245                                 nr_superpages = sg_res / lvl_pages;
2246                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2247
2248                                 /*
2249                                  * Ensure that old small page tables are
2250                                  * removed to make room for superpage(s).
2251                                  */
2252                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
2253                         } else {
2254                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2255                         }
2256
2257                 }
2258                 /* We don't need a lock here; nobody else
2259                  * touches this iova range
2260                  */
2261                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2262                 if (tmp) {
2263                         static int dumps = 5;
2264                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2265                                 iov_pfn, tmp, (unsigned long long)pteval);
2266                         if (dumps) {
2267                                 dumps--;
2268                                 debug_dma_dump_mappings(NULL);
2269                         }
2270                         WARN_ON(1);
2271                 }
2272
2273                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2274
2275                 BUG_ON(nr_pages < lvl_pages);
2276                 BUG_ON(sg_res < lvl_pages);
2277
2278                 nr_pages -= lvl_pages;
2279                 iov_pfn += lvl_pages;
2280                 phys_pfn += lvl_pages;
2281                 pteval += lvl_pages * VTD_PAGE_SIZE;
2282                 sg_res -= lvl_pages;
2283
2284                 /* If the next PTE would be the first in a new page, then we
2285                    need to flush the cache on the entries we've just written.
2286                    And then we'll need to recalculate 'pte', so clear it and
2287                    let it get set again in the if (!pte) block above.
2288
2289                    If we're done (!nr_pages) we need to flush the cache too.
2290
2291                    Also if we've been setting superpages, we may need to
2292                    recalculate 'pte' and switch back to smaller pages for the
2293                    end of the mapping, if the trailing size is not enough to
2294                    use another superpage (i.e. sg_res < lvl_pages). */
2295                 pte++;
2296                 if (!nr_pages || first_pte_in_page(pte) ||
2297                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2298                         domain_flush_cache(domain, first_pte,
2299                                            (void *)pte - (void *)first_pte);
2300                         pte = NULL;
2301                 }
2302
2303                 if (!sg_res && nr_pages)
2304                         sg = sg_next(sg);
2305         }
2306         return 0;
2307 }
2308
2309 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2310                                     struct scatterlist *sg, unsigned long nr_pages,
2311                                     int prot)
2312 {
2313         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2314 }
2315
2316 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2317                                      unsigned long phys_pfn, unsigned long nr_pages,
2318                                      int prot)
2319 {
2320         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2321 }
2322
2323 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2324 {
2325         if (!iommu)
2326                 return;
2327
2328         clear_context_table(iommu, bus, devfn);
2329         iommu->flush.flush_context(iommu, 0, 0, 0,
2330                                            DMA_CCMD_GLOBAL_INVL);
2331         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2332 }
2333
2334 static inline void unlink_domain_info(struct device_domain_info *info)
2335 {
2336         assert_spin_locked(&device_domain_lock);
2337         list_del(&info->link);
2338         list_del(&info->global);
2339         if (info->dev)
2340                 info->dev->archdata.iommu = NULL;
2341 }
2342
2343 static void domain_remove_dev_info(struct dmar_domain *domain)
2344 {
2345         struct device_domain_info *info, *tmp;
2346         unsigned long flags;
2347
2348         spin_lock_irqsave(&device_domain_lock, flags);
2349         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2350                 __dmar_remove_one_dev_info(info);
2351         spin_unlock_irqrestore(&device_domain_lock, flags);
2352 }
2353
2354 /*
2355  * find_domain
2356  * Note: struct device->archdata.iommu is used to store the domain info.
2357  */
2358 static struct dmar_domain *find_domain(struct device *dev)
2359 {
2360         struct device_domain_info *info;
2361
2362         /* No lock here; we assume no domain exits in the normal case */
2363         info = dev->archdata.iommu;
2364         if (info)
2365                 return info->domain;
2366         return NULL;
2367 }
2368
2369 static inline struct device_domain_info *
2370 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2371 {
2372         struct device_domain_info *info;
2373
2374         list_for_each_entry(info, &device_domain_list, global)
2375                 if (info->iommu->segment == segment && info->bus == bus &&
2376                     info->devfn == devfn)
2377                         return info;
2378
2379         return NULL;
2380 }
2381
2382 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2383                                                     int bus, int devfn,
2384                                                     struct device *dev,
2385                                                     struct dmar_domain *domain)
2386 {
2387         struct dmar_domain *found = NULL;
2388         struct device_domain_info *info;
2389         unsigned long flags;
2390         int ret;
2391
2392         info = alloc_devinfo_mem();
2393         if (!info)
2394                 return NULL;
2395
2396         info->bus = bus;
2397         info->devfn = devfn;
2398         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2399         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2400         info->ats_qdep = 0;
2401         info->dev = dev;
2402         info->domain = domain;
2403         info->iommu = iommu;
2404
2405         if (dev && dev_is_pci(dev)) {
2406                 struct pci_dev *pdev = to_pci_dev(info->dev);
2407
2408                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2409                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2410                     dmar_find_matched_atsr_unit(pdev))
2411                         info->ats_supported = 1;
2412
2413                 if (ecs_enabled(iommu)) {
2414                         if (pasid_enabled(iommu)) {
2415                                 int features = pci_pasid_features(pdev);
2416                                 if (features >= 0)
2417                                         info->pasid_supported = features | 1;
2418                         }
2419
2420                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2421                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2422                                 info->pri_supported = 1;
2423                 }
2424         }
2425
2426         spin_lock_irqsave(&device_domain_lock, flags);
2427         if (dev)
2428                 found = find_domain(dev);
2429
2430         if (!found) {
2431                 struct device_domain_info *info2;
2432                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2433                 if (info2) {
2434                         found      = info2->domain;
2435                         info2->dev = dev;
2436                 }
2437         }
2438
2439         if (found) {
2440                 spin_unlock_irqrestore(&device_domain_lock, flags);
2441                 free_devinfo_mem(info);
2442                 /* Caller must free the original domain */
2443                 return found;
2444         }
2445
2446         spin_lock(&iommu->lock);
2447         ret = domain_attach_iommu(domain, iommu);
2448         spin_unlock(&iommu->lock);
2449
2450         if (ret) {
2451                 spin_unlock_irqrestore(&device_domain_lock, flags);
2452                 free_devinfo_mem(info);
2453                 return NULL;
2454         }
2455
2456         list_add(&info->link, &domain->devices);
2457         list_add(&info->global, &device_domain_list);
2458         if (dev)
2459                 dev->archdata.iommu = info;
2460         spin_unlock_irqrestore(&device_domain_lock, flags);
2461
2462         if (dev && domain_context_mapping(domain, dev)) {
2463                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2464                 dmar_remove_one_dev_info(domain, dev);
2465                 return NULL;
2466         }
2467
2468         return domain;
2469 }
2470
2471 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2472 {
2473         *(u16 *)opaque = alias;
2474         return 0;
2475 }
2476
2477 /* domain is initialized */
2478 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2479 {
2480         struct device_domain_info *info = NULL;
2481         struct dmar_domain *domain, *tmp;
2482         struct intel_iommu *iommu;
2483         u16 req_id, dma_alias;
2484         unsigned long flags;
2485         u8 bus, devfn;
2486
2487         domain = find_domain(dev);
2488         if (domain)
2489                 return domain;
2490
2491         iommu = device_to_iommu(dev, &bus, &devfn);
2492         if (!iommu)
2493                 return NULL;
2494
2495         req_id = ((u16)bus << 8) | devfn;
2496
2497         if (dev_is_pci(dev)) {
2498                 struct pci_dev *pdev = to_pci_dev(dev);
2499
2500                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2501
2502                 spin_lock_irqsave(&device_domain_lock, flags);
2503                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2504                                                       PCI_BUS_NUM(dma_alias),
2505                                                       dma_alias & 0xff);
2506                 if (info) {
2507                         iommu = info->iommu;
2508                         domain = info->domain;
2509                 }
2510                 spin_unlock_irqrestore(&device_domain_lock, flags);
2511
2512                 /* The DMA alias already has a domain; use it */
2513                 if (info)
2514                         goto found_domain;
2515         }
2516
2517         /* Allocate and initialize new domain for the device */
2518         domain = alloc_domain(0);
2519         if (!domain)
2520                 return NULL;
2521         if (domain_init(domain, iommu, gaw)) {
2522                 domain_exit(domain);
2523                 return NULL;
2524         }
2525
2526         /* register PCI DMA alias device */
2527         if (req_id != dma_alias && dev_is_pci(dev)) {
2528                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2529                                                dma_alias & 0xff, NULL, domain);
2530
2531                 if (!tmp || tmp != domain) {
2532                         domain_exit(domain);
2533                         domain = tmp;
2534                 }
2535
2536                 if (!domain)
2537                         return NULL;
2538         }
2539
2540 found_domain:
2541         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2542
2543         if (!tmp || tmp != domain) {
2544                 domain_exit(domain);
2545                 domain = tmp;
2546         }
2547
2548         return domain;
2549 }
2550
2551 static int iommu_domain_identity_map(struct dmar_domain *domain,
2552                                      unsigned long long start,
2553                                      unsigned long long end)
2554 {
2555         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2556         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2557
2558         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2559                           dma_to_mm_pfn(last_vpfn))) {
2560                 pr_err("Reserving iova failed\n");
2561                 return -ENOMEM;
2562         }
2563
2564         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2565         /*
2566          * The RMRR range might overlap with the physical memory range,
2567          * so clear it first
2568          */
2569         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2570
2571         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2572                                   last_vpfn - first_vpfn + 1,
2573                                   DMA_PTE_READ|DMA_PTE_WRITE);
2574 }
2575
2576 static int domain_prepare_identity_map(struct device *dev,
2577                                        struct dmar_domain *domain,
2578                                        unsigned long long start,
2579                                        unsigned long long end)
2580 {
2581         /* For _hardware_ passthrough, don't bother. But for software
2582            passthrough, we do it anyway -- it may indicate a memory
2583            range which is reserved in E820 and thus didn't get set
2584            up in si_domain to start with */
2585         if (domain == si_domain && hw_pass_through) {
2586                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2587                         dev_name(dev), start, end);
2588                 return 0;
2589         }
2590
2591         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2592                 dev_name(dev), start, end);
2593
2594         if (end < start) {
2595                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2596                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2597                         dmi_get_system_info(DMI_BIOS_VENDOR),
2598                         dmi_get_system_info(DMI_BIOS_VERSION),
2599                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2600                 return -EIO;
2601         }
2602
2603         if (end >> agaw_to_width(domain->agaw)) {
2604                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2605                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2606                      agaw_to_width(domain->agaw),
2607                      dmi_get_system_info(DMI_BIOS_VENDOR),
2608                      dmi_get_system_info(DMI_BIOS_VERSION),
2609                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2610                 return -EIO;
2611         }
2612
2613         return iommu_domain_identity_map(domain, start, end);
2614 }
2615
2616 static int iommu_prepare_identity_map(struct device *dev,
2617                                       unsigned long long start,
2618                                       unsigned long long end)
2619 {
2620         struct dmar_domain *domain;
2621         int ret;
2622
2623         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2624         if (!domain)
2625                 return -ENOMEM;
2626
2627         ret = domain_prepare_identity_map(dev, domain, start, end);
2628         if (ret)
2629                 domain_exit(domain);
2630
2631         return ret;
2632 }
2633
2634 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2635                                          struct device *dev)
2636 {
2637         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2638                 return 0;
2639         return iommu_prepare_identity_map(dev, rmrr->base_address,
2640                                           rmrr->end_address);
2641 }
2642
2643 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2644 static inline void iommu_prepare_isa(void)
2645 {
2646         struct pci_dev *pdev;
2647         int ret;
2648
2649         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2650         if (!pdev)
2651                 return;
2652
2653         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2654         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2655
2656         if (ret)
2657                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2658
2659         pci_dev_put(pdev);
2660 }
2661 #else
2662 static inline void iommu_prepare_isa(void)
2663 {
2664         return;
2665 }
2666 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2667
2668 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2669
2670 static int __init si_domain_init(int hw)
2671 {
2672         int nid, ret = 0;
2673
2674         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2675         if (!si_domain)
2676                 return -EFAULT;
2677
2678         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2679                 domain_exit(si_domain);
2680                 return -EFAULT;
2681         }
2682
2683         pr_debug("Identity mapping domain allocated\n");
2684
2685         if (hw)
2686                 return 0;
2687
2688         for_each_online_node(nid) {
2689                 unsigned long start_pfn, end_pfn;
2690                 int i;
2691
2692                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2693                         ret = iommu_domain_identity_map(si_domain,
2694                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2695                         if (ret)
2696                                 return ret;
2697                 }
2698         }
2699
2700         return 0;
2701 }
2702
2703 static int identity_mapping(struct device *dev)
2704 {
2705         struct device_domain_info *info;
2706
2707         if (likely(!iommu_identity_mapping))
2708                 return 0;
2709
2710         info = dev->archdata.iommu;
2711         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2712                 return (info->domain == si_domain);
2713
2714         return 0;
2715 }
2716
2717 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2718 {
2719         struct dmar_domain *ndomain;
2720         struct intel_iommu *iommu;
2721         u8 bus, devfn;
2722
2723         iommu = device_to_iommu(dev, &bus, &devfn);
2724         if (!iommu)
2725                 return -ENODEV;
2726
2727         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2728         if (ndomain != domain)
2729                 return -EBUSY;
2730
2731         return 0;
2732 }
2733
2734 static bool device_has_rmrr(struct device *dev)
2735 {
2736         struct dmar_rmrr_unit *rmrr;
2737         struct device *tmp;
2738         int i;
2739
2740         rcu_read_lock();
2741         for_each_rmrr_units(rmrr) {
2742                 /*
2743                  * Return TRUE if this RMRR contains the device that
2744                  * is passed in.
2745                  */
2746                 for_each_active_dev_scope(rmrr->devices,
2747                                           rmrr->devices_cnt, i, tmp)
2748                         if (tmp == dev) {
2749                                 rcu_read_unlock();
2750                                 return true;
2751                         }
2752         }
2753         rcu_read_unlock();
2754         return false;
2755 }
2756
2757 /*
2758  * There are a couple of cases where we need to restrict the functionality of
2759  * devices associated with RMRRs.  The first is when evaluating a device for
2760  * identity mapping because problems exist when devices are moved in and out
2761  * of domains and their respective RMRR information is lost.  This means that
2762  * a device with associated RMRRs will never be in a "passthrough" domain.
2763  * The second is use of the device through the IOMMU API.  This interface
2764  * expects to have full control of the IOVA space for the device.  We cannot
2765  * satisfy both the requirement that RMRR access is maintained and have an
2766  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2767  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2768  * We therefore prevent devices associated with an RMRR from participating in
2769  * the IOMMU API, which eliminates them from device assignment.
2770  *
2771  * In both cases we assume that PCI USB devices with RMRRs have them largely
2772  * for historical reasons and that the RMRR space is not actively used post
2773  * boot.  This exclusion may change if vendors begin to abuse it.
2774  *
2775  * The same exception is made for graphics devices, with the requirement that
2776  * any use of the RMRR regions will be torn down before assigning the device
2777  * to a guest.
2778  */
2779 static bool device_is_rmrr_locked(struct device *dev)
2780 {
2781         if (!device_has_rmrr(dev))
2782                 return false;
2783
2784         if (dev_is_pci(dev)) {
2785                 struct pci_dev *pdev = to_pci_dev(dev);
2786
2787                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2788                         return false;
2789         }
2790
2791         return true;
2792 }
2793
2794 static int iommu_should_identity_map(struct device *dev, int startup)
2795 {
2796
2797         if (dev_is_pci(dev)) {
2798                 struct pci_dev *pdev = to_pci_dev(dev);
2799
2800                 if (device_is_rmrr_locked(dev))
2801                         return 0;
2802
2803                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2804                         return 1;
2805
2806                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2807                         return 1;
2808
2809                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2810                         return 0;
2811
2812                 /*
2813                  * We want to start off with all devices in the 1:1 domain, and
2814                  * take them out later if we find they can't access all of memory.
2815                  *
2816                  * However, we can't do this for PCI devices behind bridges,
2817                  * because all PCI devices behind the same bridge will end up
2818                  * with the same source-id on their transactions.
2819                  *
2820                  * Practically speaking, we can't change things around for these
2821                  * devices at run-time, because we can't be sure there'll be no
2822                  * DMA transactions in flight for any of their siblings.
2823                  *
2824                  * So PCI devices (unless they're on the root bus) as well as
2825                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2826                  * the 1:1 domain, just in _case_ one of their siblings turns out
2827                  * not to be able to map all of memory.
2828                  */
2829                 if (!pci_is_pcie(pdev)) {
2830                         if (!pci_is_root_bus(pdev->bus))
2831                                 return 0;
2832                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2833                                 return 0;
2834                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2835                         return 0;
2836         } else {
2837                 if (device_has_rmrr(dev))
2838                         return 0;
2839         }
2840
2841         /*
2842          * At boot time, we don't yet know if devices will be 64-bit capable.
2843          * Assume that they will — if they turn out not to be, then we can
2844          * take them out of the 1:1 domain later.
2845          */
2846         if (!startup) {
2847                 /*
2848                  * If the device's dma_mask is less than the system's memory
2849                  * size then this is not a candidate for identity mapping.
2850                  */
2851                 u64 dma_mask = *dev->dma_mask;
2852
2853                 if (dev->coherent_dma_mask &&
2854                     dev->coherent_dma_mask < dma_mask)
2855                         dma_mask = dev->coherent_dma_mask;
2856
2857                 return dma_mask >= dma_get_required_mask(dev);
2858         }
2859
2860         return 1;
2861 }
2862
2863 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2864 {
2865         int ret;
2866
2867         if (!iommu_should_identity_map(dev, 1))
2868                 return 0;
2869
2870         ret = domain_add_dev_info(si_domain, dev);
2871         if (!ret)
2872                 pr_info("%s identity mapping for device %s\n",
2873                         hw ? "Hardware" : "Software", dev_name(dev));
2874         else if (ret == -ENODEV)
2875                 /* device not associated with an iommu */
2876                 ret = 0;
2877
2878         return ret;
2879 }
2880
2881
2882 static int __init iommu_prepare_static_identity_mapping(int hw)
2883 {
2884         struct pci_dev *pdev = NULL;
2885         struct dmar_drhd_unit *drhd;
2886         struct intel_iommu *iommu;
2887         struct device *dev;
2888         int i;
2889         int ret = 0;
2890
2891         for_each_pci_dev(pdev) {
2892                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2893                 if (ret)
2894                         return ret;
2895         }
2896
2897         for_each_active_iommu(iommu, drhd)
2898                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2899                         struct acpi_device_physical_node *pn;
2900                         struct acpi_device *adev;
2901
2902                         if (dev->bus != &acpi_bus_type)
2903                                 continue;
2904
2905                         adev = to_acpi_device(dev);
2906                         mutex_lock(&adev->physical_node_lock);
2907                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2908                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2909                                 if (ret)
2910                                         break;
2911                         }
2912                         mutex_unlock(&adev->physical_node_lock);
2913                         if (ret)
2914                                 return ret;
2915                 }
2916
2917         return 0;
2918 }
2919
2920 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2921 {
2922         /*
2923          * Start from a sane IOMMU hardware state.
2924          * If queued invalidation was already initialized by us
2925          * (for example, while enabling interrupt remapping) then
2926          * things are already rolling from a sane state.
2927          */
2928         if (!iommu->qi) {
2929                 /*
2930                  * Clear any previous faults.
2931                  */
2932                 dmar_fault(-1, iommu);
2933                 /*
2934                  * Disable queued invalidation if supported and already enabled
2935                  * before OS handover.
2936                  */
2937                 dmar_disable_qi(iommu);
2938         }
2939
2940         if (dmar_enable_qi(iommu)) {
2941                 /*
2942                  * Queued invalidation is not enabled; use register-based invalidation
2943                  */
2944                 iommu->flush.flush_context = __iommu_flush_context;
2945                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2946                 pr_info("%s: Using Register based invalidation\n",
2947                         iommu->name);
2948         } else {
2949                 iommu->flush.flush_context = qi_flush_context;
2950                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2951                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2952         }
2953 }
2954
2955 static int copy_context_table(struct intel_iommu *iommu,
2956                               struct root_entry *old_re,
2957                               struct context_entry **tbl,
2958                               int bus, bool ext)
2959 {
2960         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2961         struct context_entry *new_ce = NULL, ce;
2962         struct context_entry *old_ce = NULL;
2963         struct root_entry re;
2964         phys_addr_t old_ce_phys;
2965
2966         tbl_idx = ext ? bus * 2 : bus;
2967         memcpy(&re, old_re, sizeof(re));
2968
2969         for (devfn = 0; devfn < 256; devfn++) {
2970                 /* First calculate the correct index */
2971                 idx = (ext ? devfn * 2 : devfn) % 256;
2972
2973                 if (idx == 0) {
2974                         /* First save what we may have and clean up */
2975                         if (new_ce) {
2976                                 tbl[tbl_idx] = new_ce;
2977                                 __iommu_flush_cache(iommu, new_ce,
2978                                                     VTD_PAGE_SIZE);
2979                                 pos = 1;
2980                         }
2981
2982                         if (old_ce)
2983                                 memunmap(old_ce);
2984
2985                         ret = 0;
2986                         if (devfn < 0x80)
2987                                 old_ce_phys = root_entry_lctp(&re);
2988                         else
2989                                 old_ce_phys = root_entry_uctp(&re);
2990
2991                         if (!old_ce_phys) {
2992                                 if (ext && devfn == 0) {
2993                                         /* No LCTP, try UCTP */
2994                                         devfn = 0x7f;
2995                                         continue;
2996                                 } else {
2997                                         goto out;
2998                                 }
2999                         }
3000
3001                         ret = -ENOMEM;
3002                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3003                                         MEMREMAP_WB);
3004                         if (!old_ce)
3005                                 goto out;
3006
3007                         new_ce = alloc_pgtable_page(iommu->node);
3008                         if (!new_ce)
3009                                 goto out_unmap;
3010
3011                         ret = 0;
3012                 }
3013
3014                 /* Now copy the context entry */
3015                 memcpy(&ce, old_ce + idx, sizeof(ce));
3016
3017                 if (!__context_present(&ce))
3018                         continue;
3019
3020                 did = context_domain_id(&ce);
3021                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3022                         set_bit(did, iommu->domain_ids);
3023
3024                 /*
3025                  * We need a marker for copied context entries. This
3026                  * marker needs to work for the old format as well as
3027                  * for extended context entries.
3028                  *
3029                  * Bit 67 of the context entry is used. In the old
3030                  * format this bit is available to software, in the
3031                  * extended format it is the PGE bit, but PGE is ignored
3032                  * by HW if PASIDs are disabled (and thus still
3033                  * available).
3034                  *
3035                  * So disable PASIDs first and then mark the entry
3036                  * copied. This means that we don't copy PASID
3037                  * translations from the old kernel, but this is fine as
3038                  * faults there are not fatal.
3039                  */
3040                 context_clear_pasid_enable(&ce);
3041                 context_set_copied(&ce);
3042
3043                 new_ce[idx] = ce;
3044         }
3045
3046         tbl[tbl_idx + pos] = new_ce;
3047
3048         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3049
3050 out_unmap:
3051         memunmap(old_ce);
3052
3053 out:
3054         return ret;
3055 }
3056
3057 static int copy_translation_tables(struct intel_iommu *iommu)
3058 {
3059         struct context_entry **ctxt_tbls;
3060         struct root_entry *old_rt;
3061         phys_addr_t old_rt_phys;
3062         int ctxt_table_entries;
3063         unsigned long flags;
3064         u64 rtaddr_reg;
3065         int bus, ret;
3066         bool new_ext, ext;
3067
3068         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3069         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3070         new_ext    = !!ecap_ecs(iommu->ecap);
3071
3072         /*
3073          * The RTT bit can only be changed when translation is disabled,
3074          * but disabling translation would open a window for data
3075          * corruption. So bail out and don't copy anything if we would
3076          * have to change the bit.
3077          */
3078         if (new_ext != ext)
3079                 return -EINVAL;
3080
3081         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3082         if (!old_rt_phys)
3083                 return -EINVAL;
3084
3085         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3086         if (!old_rt)
3087                 return -ENOMEM;
3088
3089         /* This is too big for the stack - allocate it from slab */
3090         ctxt_table_entries = ext ? 512 : 256;
3091         ret = -ENOMEM;
3092         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3093         if (!ctxt_tbls)
3094                 goto out_unmap;
3095
3096         for (bus = 0; bus < 256; bus++) {
3097                 ret = copy_context_table(iommu, &old_rt[bus],
3098                                          ctxt_tbls, bus, ext);
3099                 if (ret) {
3100                         pr_err("%s: Failed to copy context table for bus %d\n",
3101                                 iommu->name, bus);
3102                         continue;
3103                 }
3104         }
3105
3106         spin_lock_irqsave(&iommu->lock, flags);
3107
3108         /* Context tables are copied, now write them to the root_entry table */
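        /*
         * Bit 0 of the root entry (and, in extended mode, of each half) is
         * the present bit, so a context table at a hypothetical physical
         * address of 0x12345000 is installed as 0x12345001.
         */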
3109         for (bus = 0; bus < 256; bus++) {
3110                 int idx = ext ? bus * 2 : bus;
3111                 u64 val;
3112
3113                 if (ctxt_tbls[idx]) {
3114                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3115                         iommu->root_entry[bus].lo = val;
3116                 }
3117
3118                 if (!ext || !ctxt_tbls[idx + 1])
3119                         continue;
3120
3121                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3122                 iommu->root_entry[bus].hi = val;
3123         }
3124
3125         spin_unlock_irqrestore(&iommu->lock, flags);
3126
3127         kfree(ctxt_tbls);
3128
3129         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3130
3131         ret = 0;
3132
3133 out_unmap:
3134         memunmap(old_rt);
3135
3136         return ret;
3137 }
3138
3139 static int __init init_dmars(void)
3140 {
3141         struct dmar_drhd_unit *drhd;
3142         struct dmar_rmrr_unit *rmrr;
3143         bool copied_tables = false;
3144         struct device *dev;
3145         struct intel_iommu *iommu;
3146         int i, ret;
3147
3148         /*
3149          * for each drhd
3150          *    allocate root
3151          *    initialize and program root entry to not present
3152          * endfor
3153          */
3154         for_each_drhd_unit(drhd) {
3155                 /*
3156                  * No lock needed: this is only incremented in the
3157                  * single-threaded kernel __init code path; all other
3158                  * accesses are read-only.
3159                  */
3160                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3161                         g_num_of_iommus++;
3162                         continue;
3163                 }
3164                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3165         }
3166
3167         /* Preallocate enough resources for IOMMU hot-addition */
3168         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3169                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3170
3171         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3172                         GFP_KERNEL);
3173         if (!g_iommus) {
3174                 pr_err("Allocating global iommu array failed\n");
3175                 ret = -ENOMEM;
3176                 goto error;
3177         }
3178
3179         deferred_flush = kzalloc(g_num_of_iommus *
3180                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
3181         if (!deferred_flush) {
3182                 ret = -ENOMEM;
3183                 goto free_g_iommus;
3184         }
3185
3186         for_each_active_iommu(iommu, drhd) {
3187                 g_iommus[iommu->seq_id] = iommu;
3188
3189                 intel_iommu_init_qi(iommu);
3190
3191                 ret = iommu_init_domains(iommu);
3192                 if (ret)
3193                         goto free_iommu;
3194
3195                 init_translation_status(iommu);
3196
3197                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3198                         iommu_disable_translation(iommu);
3199                         clear_translation_pre_enabled(iommu);
3200                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3201                                 iommu->name);
3202                 }
3203
3204                 /*
3205                  * TBD:
3206                  * we could share the same root & context tables
3207                  * among all IOMMUs. Needs to be split out later.
3208                  */
3209                 ret = iommu_alloc_root_entry(iommu);
3210                 if (ret)
3211                         goto free_iommu;
3212
3213                 if (translation_pre_enabled(iommu)) {
3214                         pr_info("Translation already enabled - trying to copy translation structures\n");
3215
3216                         ret = copy_translation_tables(iommu);
3217                         if (ret) {
3218                                 /*
3219                                  * We found the IOMMU with translation
3220                                  * enabled - but failed to copy over the
3221                                  * old root-entry table. Try to proceed
3222                                  * by disabling translation now and
3223                                  * allocating a clean root-entry table.
3224                                  * This might cause DMAR faults, but
3225                                  * probably the dump will still succeed.
3226                                  */
3227                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3228                                        iommu->name);
3229                                 iommu_disable_translation(iommu);
3230                                 clear_translation_pre_enabled(iommu);
3231                         } else {
3232                                 pr_info("Copied translation tables from previous kernel for %s\n",
3233                                         iommu->name);
3234                                 copied_tables = true;
3235                         }
3236                 }
3237
3238                 if (!ecap_pass_through(iommu->ecap))
3239                         hw_pass_through = 0;
3240 #ifdef CONFIG_INTEL_IOMMU_SVM
3241                 if (pasid_enabled(iommu))
3242                         intel_svm_alloc_pasid_tables(iommu);
3243 #endif
3244         }
3245
3246         /*
3247          * Now that qi is enabled on all iommus, set the root entry and flush
3248          * caches. This is required on some Intel X58 chipsets, otherwise the
3249          * flush_context function will loop forever and the boot hangs.
3250          */
3251         for_each_active_iommu(iommu, drhd) {
3252                 iommu_flush_write_buffer(iommu);
3253                 iommu_set_root_entry(iommu);
3254                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3255                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3256         }
3257
3258         if (iommu_pass_through)
3259                 iommu_identity_mapping |= IDENTMAP_ALL;
3260
3261 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3262         dmar_map_gfx = 0;
3263 #endif
3264
3265         if (!dmar_map_gfx)
3266                 iommu_identity_mapping |= IDENTMAP_GFX;
3267
3268         check_tylersburg_isoch();
3269
3270         if (iommu_identity_mapping) {
3271                 ret = si_domain_init(hw_pass_through);
3272                 if (ret)
3273                         goto free_iommu;
3274         }
3275
3276
3277         /*
3278          * If we copied translations from a previous kernel in the kdump
3279          * case, we can not assign the devices to domains now, as that
3280          * would eliminate the old mappings. So skip this part and defer
3281          * the assignment to device driver initialization time.
3282          */
3283         if (copied_tables)
3284                 goto domains_done;
3285
3286         /*
3287          * If pass-through is not set or not enabled, set up context entries
3288          * for identity mappings for rmrr, gfx and isa, and fall back to the
3289          * static identity mapping if iommu_identity_mapping is set.
3290          */
3291         if (iommu_identity_mapping) {
3292                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3293                 if (ret) {
3294                         pr_crit("Failed to setup IOMMU pass-through\n");
3295                         goto free_iommu;
3296                 }
3297         }
3298         /*
3299          * For each rmrr
3300          *   for each dev attached to rmrr
3301          *   do
3302          *     locate drhd for dev, alloc domain for dev
3303          *     allocate free domain
3304          *     allocate page table entries for rmrr
3305          *     if context not allocated for bus
3306          *           allocate and init context
3307          *           set present in root table for this bus
3308          *     init context with domain, translation etc
3309          *    endfor
3310          * endfor
3311          */
3312         pr_info("Setting RMRR:\n");
3313         for_each_rmrr_units(rmrr) {
3314                 /* some BIOSes list non-existent devices in the DMAR table. */
3315                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3316                                           i, dev) {
3317                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3318                         if (ret)
3319                                 pr_err("Mapping reserved region failed\n");
3320                 }
3321         }
3322
3323         iommu_prepare_isa();
3324
3325 domains_done:
3326
3327         /*
3328          * for each drhd
3329          *   enable fault log
3330          *   global invalidate context cache
3331          *   global invalidate iotlb
3332          *   enable translation
3333          */
3334         for_each_iommu(iommu, drhd) {
3335                 if (drhd->ignored) {
3336                         /*
3337                          * we always have to disable PMRs or DMA may fail on
3338                          * this device
3339                          */
3340                         if (force_on)
3341                                 iommu_disable_protect_mem_regions(iommu);
3342                         continue;
3343                 }
3344
3345                 iommu_flush_write_buffer(iommu);
3346
3347 #ifdef CONFIG_INTEL_IOMMU_SVM
3348                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3349                         ret = intel_svm_enable_prq(iommu);
3350                         if (ret)
3351                                 goto free_iommu;
3352                 }
3353 #endif
3354                 ret = dmar_set_interrupt(iommu);
3355                 if (ret)
3356                         goto free_iommu;
3357
3358                 if (!translation_pre_enabled(iommu))
3359                         iommu_enable_translation(iommu);
3360
3361                 iommu_disable_protect_mem_regions(iommu);
3362         }
3363
3364         return 0;
3365
3366 free_iommu:
3367         for_each_active_iommu(iommu, drhd) {
3368                 disable_dmar_iommu(iommu);
3369                 free_dmar_iommu(iommu);
3370         }
3371         kfree(deferred_flush);
3372 free_g_iommus:
3373         kfree(g_iommus);
3374 error:
3375         return ret;
3376 }
3377
3378 /* This takes a number of _MM_ pages, not VTD pages */
3379 static struct iova *intel_alloc_iova(struct device *dev,
3380                                      struct dmar_domain *domain,
3381                                      unsigned long nrpages, uint64_t dma_mask)
3382 {
3383         struct iova *iova = NULL;
3384
3385         /* Restrict dma_mask to the width that the iommu can handle */
3386         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3387         /* Ensure we reserve the whole size-aligned region */
3388         nrpages = __roundup_pow_of_two(nrpages);
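        /*
         * For instance, a 3-page request becomes a 4-page, size-aligned
         * reservation here; with a 64-bit dma_mask the allocation below still
         * tries to stay under 4GiB first unless forcedac is set.
         */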
3389
3390         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3391                 /*
3392                  * First try to allocate an io virtual address below
3393                  * DMA_BIT_MASK(32), and if that fails then try allocating
3394                  * from the higher range.
3395                  */
3396                 iova = alloc_iova(&domain->iovad, nrpages,
3397                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3398                 if (iova)
3399                         return iova;
3400         }
3401         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3402         if (unlikely(!iova)) {
3403                 pr_err("Allocating %lu-page iova for %s failed\n",
3404                        nrpages, dev_name(dev));
3405                 return NULL;
3406         }
3407
3408         return iova;
3409 }
3410
3411 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
3412 {
3413         struct dmar_rmrr_unit *rmrr;
3414         struct dmar_domain *domain;
3415         struct device *i_dev;
3416         int i, ret;
3417
3418         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3419         if (!domain) {
3420                 pr_err("Allocating domain for %s failed\n",
3421                        dev_name(dev));
3422                 return NULL;
3423         }
3424
3425         /* We have a new domain - setup possible RMRRs for the device */
3426         rcu_read_lock();
3427         for_each_rmrr_units(rmrr) {
3428                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3429                                           i, i_dev) {
3430                         if (i_dev != dev)
3431                                 continue;
3432
3433                         ret = domain_prepare_identity_map(dev, domain,
3434                                                           rmrr->base_address,
3435                                                           rmrr->end_address);
3436                         if (ret)
3437                                 dev_err(dev, "Mapping reserved region failed\n");
3438                 }
3439         }
3440         rcu_read_unlock();
3441
3442         return domain;
3443 }
3444
3445 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3446 {
3447         struct device_domain_info *info;
3448
3449         /* No lock here, assumes no domain exit in normal case */
3450         info = dev->archdata.iommu;
3451         if (likely(info))
3452                 return info->domain;
3453
3454         return __get_valid_domain_for_dev(dev);
3455 }
3456
3457 /* Check if the dev needs to go through the non-identity map/unmap process. */
3458 static int iommu_no_mapping(struct device *dev)
3459 {
3460         int found;
3461
3462         if (iommu_dummy(dev))
3463                 return 1;
3464
3465         if (!iommu_identity_mapping)
3466                 return 0;
3467
3468         found = identity_mapping(dev);
3469         if (found) {
3470                 if (iommu_should_identity_map(dev, 0))
3471                         return 1;
3472                 else {
3473                         /*
3474                          * The 32-bit DMA device is removed from si_domain
3475                          * and falls back to non-identity mapping.
3476                          */
3477                         dmar_remove_one_dev_info(si_domain, dev);
3478                         pr_info("32bit %s uses non-identity mapping\n",
3479                                 dev_name(dev));
3480                         return 0;
3481                 }
3482         } else {
3483                 /*
3484                  * A 64-bit DMA device detached from a VM is put back into
3485                  * si_domain for identity mapping.
3486                  */
3487                 if (iommu_should_identity_map(dev, 0)) {
3488                         int ret;
3489                         ret = domain_add_dev_info(si_domain, dev);
3490                         if (!ret) {
3491                                 pr_info("64bit %s uses identity mapping\n",
3492                                         dev_name(dev));
3493                                 return 1;
3494                         }
3495                 }
3496         }
3497
3498         return 0;
3499 }
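/*
 * In short, iommu_no_mapping() returns 1 when the device should bypass
 * translation (dummy devices, or devices that belong in the static identity
 * domain) and 0 when it needs a real DMAR domain, moving the device in or out
 * of si_domain as its suitability changes.
 */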
3500
3501 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3502                                      size_t size, int dir, u64 dma_mask)
3503 {
3504         struct dmar_domain *domain;
3505         phys_addr_t start_paddr;
3506         struct iova *iova;
3507         int prot = 0;
3508         int ret;
3509         struct intel_iommu *iommu;
3510         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3511
3512         BUG_ON(dir == DMA_NONE);
3513
3514         if (iommu_no_mapping(dev))
3515                 return paddr;
3516
3517         domain = get_valid_domain_for_dev(dev);
3518         if (!domain)
3519                 return 0;
3520
3521         iommu = domain_get_iommu(domain);
3522         size = aligned_nrpages(paddr, size);
3523
3524         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3525         if (!iova)
3526                 goto error;
3527
3528         /*
3529          * Check if DMAR supports zero-length reads on write-only
3530          * mappings.
3531          */
3532         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3533                         !cap_zlr(iommu->cap))
3534                 prot |= DMA_PTE_READ;
3535         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3536                 prot |= DMA_PTE_WRITE;
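        /*
         * For example, a DMA_FROM_DEVICE mapping on hardware with cap_zlr set
         * ends up with DMA_PTE_WRITE only, while DMA_BIDIRECTIONAL always gets
         * DMA_PTE_READ | DMA_PTE_WRITE.
         */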
3537         /*
3538          * paddr ~ paddr + size might span a partial page, so map the whole
3539          * page.  Note: if two parts of one page are mapped separately, we
3540          * might end up with two guest addresses mapping to the same host
3541          * paddr, but this is not a big problem.
3542          */
3543         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3544                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3545         if (ret)
3546                 goto error;
3547
3548         /* it's a non-present to present mapping. Only flush if caching mode */
3549         if (cap_caching_mode(iommu->cap))
3550                 iommu_flush_iotlb_psi(iommu, domain,
3551                                       mm_to_dma_pfn(iova->pfn_lo),
3552                                       size, 0, 1);
3553         else
3554                 iommu_flush_write_buffer(iommu);
3555
3556         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3557         start_paddr += paddr & ~PAGE_MASK;
3558         return start_paddr;
3559
3560 error:
3561         if (iova)
3562                 __free_iova(&domain->iovad, iova);
3563         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3564                 dev_name(dev), size, (unsigned long long)paddr, dir);
3565         return 0;
3566 }
3567
3568 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3569                                  unsigned long offset, size_t size,
3570                                  enum dma_data_direction dir,
3571                                  struct dma_attrs *attrs)
3572 {
3573         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3574                                   dir, *dev->dma_mask);
3575 }
3576
3577 static void flush_unmaps(void)
3578 {
3579         int i, j;
3580
3581         timer_on = 0;
3582
3583         /* just flush them all */
3584         for (i = 0; i < g_num_of_iommus; i++) {
3585                 struct intel_iommu *iommu = g_iommus[i];
3586                 if (!iommu)
3587                         continue;
3588
3589                 if (!deferred_flush[i].next)
3590                         continue;
3591
3592                 /* In caching mode, global flushes make emulation expensive */
3593                 if (!cap_caching_mode(iommu->cap))
3594                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3595                                          DMA_TLB_GLOBAL_FLUSH);
3596                 for (j = 0; j < deferred_flush[i].next; j++) {
3597                         unsigned long mask;
3598                         struct iova *iova = deferred_flush[i].iova[j];
3599                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3600
3601                         /* On real hardware multiple invalidations are expensive */
3602                         if (cap_caching_mode(iommu->cap))
3603                                 iommu_flush_iotlb_psi(iommu, domain,
3604                                         iova->pfn_lo, iova_size(iova),
3605                                         !deferred_flush[i].freelist[j], 0);
3606                         else {
3607                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3608                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3609                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3610                         }
3611                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3612                         if (deferred_flush[i].freelist[j])
3613                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3614                 }
3615                 deferred_flush[i].next = 0;
3616         }
3617
3618         list_size = 0;
3619 }
3620
3621 static void flush_unmaps_timeout(unsigned long data)
3622 {
3623         unsigned long flags;
3624
3625         spin_lock_irqsave(&async_umap_flush_lock, flags);
3626         flush_unmaps();
3627         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3628 }
3629
3630 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3631 {
3632         unsigned long flags;
3633         int next, iommu_id;
3634         struct intel_iommu *iommu;
3635
3636         spin_lock_irqsave(&async_umap_flush_lock, flags);
3637         if (list_size == HIGH_WATER_MARK)
3638                 flush_unmaps();
3639
3640         iommu = domain_get_iommu(dom);
3641         iommu_id = iommu->seq_id;
3642
3643         next = deferred_flush[iommu_id].next;
3644         deferred_flush[iommu_id].domain[next] = dom;
3645         deferred_flush[iommu_id].iova[next] = iova;
3646         deferred_flush[iommu_id].freelist[next] = freelist;
3647         deferred_flush[iommu_id].next++;
3648
3649         if (!timer_on) {
3650                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3651                 timer_on = 1;
3652         }
3653         list_size++;
3654         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3655 }
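/*
 * Deferred unmaps are batched per IOMMU and drained either when list_size
 * reaches HIGH_WATER_MARK or when the 10ms unmap_timer fires; until then stale
 * IOTLB entries may remain valid, which is the trade-off intel_iommu_strict
 * disables.
 */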
3656
3657 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3658 {
3659         struct dmar_domain *domain;
3660         unsigned long start_pfn, last_pfn;
3661         struct iova *iova;
3662         struct intel_iommu *iommu;
3663         struct page *freelist;
3664
3665         if (iommu_no_mapping(dev))
3666                 return;
3667
3668         domain = find_domain(dev);
3669         BUG_ON(!domain);
3670
3671         iommu = domain_get_iommu(domain);
3672
3673         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3674         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3675                       (unsigned long long)dev_addr))
3676                 return;
3677
3678         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3679         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3680
3681         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3682                  dev_name(dev), start_pfn, last_pfn);
3683
3684         freelist = domain_unmap(domain, start_pfn, last_pfn);
3685
3686         if (intel_iommu_strict) {
3687                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3688                                       last_pfn - start_pfn + 1, !freelist, 0);
3689                 /* free iova */
3690                 __free_iova(&domain->iovad, iova);
3691                 dma_free_pagelist(freelist);
3692         } else {
3693                 add_unmap(domain, iova, freelist);
3694                 /*
3695                  * queue up the release of the unmap to save roughly 1/6th of
3696                  * the cpu time used up by the iotlb flush operation...
3697                  */
3698         }
3699 }
3700
3701 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3702                              size_t size, enum dma_data_direction dir,
3703                              struct dma_attrs *attrs)
3704 {
3705         intel_unmap(dev, dev_addr);
3706 }
3707
3708 static void *intel_alloc_coherent(struct device *dev, size_t size,
3709                                   dma_addr_t *dma_handle, gfp_t flags,
3710                                   struct dma_attrs *attrs)
3711 {
3712         struct page *page = NULL;
3713         int order;
3714
3715         size = PAGE_ALIGN(size);
3716         order = get_order(size);
3717
3718         if (!iommu_no_mapping(dev))
3719                 flags &= ~(GFP_DMA | GFP_DMA32);
3720         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3721                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3722                         flags |= GFP_DMA;
3723                 else
3724                         flags |= GFP_DMA32;
3725         }
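        /*
         * For example, a pass-through device with a 24-bit coherent mask gets
         * GFP_DMA, while one with a 32-bit mask on a machine whose required
         * mask is larger gets GFP_DMA32; translated devices need neither,
         * since the IOVA allocator honours the mask instead.
         */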
3726
3727         if (gfpflags_allow_blocking(flags)) {
3728                 unsigned int count = size >> PAGE_SHIFT;
3729
3730                 page = dma_alloc_from_contiguous(dev, count, order);
3731                 if (page && iommu_no_mapping(dev) &&
3732                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3733                         dma_release_from_contiguous(dev, page, count);
3734                         page = NULL;
3735                 }
3736         }
3737
3738         if (!page)
3739                 page = alloc_pages(flags, order);
3740         if (!page)
3741                 return NULL;
3742         memset(page_address(page), 0, size);
3743
3744         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3745                                          DMA_BIDIRECTIONAL,
3746                                          dev->coherent_dma_mask);
3747         if (*dma_handle)
3748                 return page_address(page);
3749         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750                 __free_pages(page, order);
3751
3752         return NULL;
3753 }
3754
3755 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3756                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3757 {
3758         int order;
3759         struct page *page = virt_to_page(vaddr);
3760
3761         size = PAGE_ALIGN(size);
3762         order = get_order(size);
3763
3764         intel_unmap(dev, dma_handle);
3765         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3766                 __free_pages(page, order);
3767 }
3768
3769 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3770                            int nelems, enum dma_data_direction dir,
3771                            struct dma_attrs *attrs)
3772 {
3773         intel_unmap(dev, sglist[0].dma_address);
3774 }
3775
3776 static int intel_nontranslate_map_sg(struct device *hddev,
3777         struct scatterlist *sglist, int nelems, int dir)
3778 {
3779         int i;
3780         struct scatterlist *sg;
3781
3782         for_each_sg(sglist, sg, nelems, i) {
3783                 BUG_ON(!sg_page(sg));
3784                 sg->dma_address = sg_phys(sg);
3785                 sg->dma_length = sg->length;
3786         }
3787         return nelems;
3788 }
3789
3790 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3791                         enum dma_data_direction dir, struct dma_attrs *attrs)
3792 {
3793         int i;
3794         struct dmar_domain *domain;
3795         size_t size = 0;
3796         int prot = 0;
3797         struct iova *iova = NULL;
3798         int ret;
3799         struct scatterlist *sg;
3800         unsigned long start_vpfn;
3801         struct intel_iommu *iommu;
3802
3803         BUG_ON(dir == DMA_NONE);
3804         if (iommu_no_mapping(dev))
3805                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3806
3807         domain = get_valid_domain_for_dev(dev);
3808         if (!domain)
3809                 return 0;
3810
3811         iommu = domain_get_iommu(domain);
3812
3813         for_each_sg(sglist, sg, nelems, i)
3814                 size += aligned_nrpages(sg->offset, sg->length);
3815
3816         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3817                                 *dev->dma_mask);
3818         if (!iova) {
3819                 sglist->dma_length = 0;
3820                 return 0;
3821         }
3822
3823         /*
3824          * Check if DMAR supports zero-length reads on write-only
3825          * mappings.
3826          */
3827         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3828                         !cap_zlr(iommu->cap))
3829                 prot |= DMA_PTE_READ;
3830         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3831                 prot |= DMA_PTE_WRITE;
3832
3833         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3834
3835         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3836         if (unlikely(ret)) {
3837                 dma_pte_free_pagetable(domain, start_vpfn,
3838                                        start_vpfn + size - 1);
3839                 __free_iova(&domain->iovad, iova);
3840                 return 0;
3841         }
3842
3843         /* it's a non-present to present mapping. Only flush if caching mode */
3844         if (cap_caching_mode(iommu->cap))
3845                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3846         else
3847                 iommu_flush_write_buffer(iommu);
3848
3849         return nelems;
3850 }
3851
3852 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3853 {
3854         return !dma_addr;
3855 }
3856
3857 struct dma_map_ops intel_dma_ops = {
3858         .alloc = intel_alloc_coherent,
3859         .free = intel_free_coherent,
3860         .map_sg = intel_map_sg,
3861         .unmap_sg = intel_unmap_sg,
3862         .map_page = intel_map_page,
3863         .unmap_page = intel_unmap_page,
3864         .mapping_error = intel_mapping_error,
3865 };
3866
3867 static inline int iommu_domain_cache_init(void)
3868 {
3869         int ret = 0;
3870
3871         iommu_domain_cache = kmem_cache_create("iommu_domain",
3872                                          sizeof(struct dmar_domain),
3873                                          0,
3874                                          SLAB_HWCACHE_ALIGN,
3876                                          NULL);
3877         if (!iommu_domain_cache) {
3878                 pr_err("Couldn't create iommu_domain cache\n");
3879                 ret = -ENOMEM;
3880         }
3881
3882         return ret;
3883 }
3884
3885 static inline int iommu_devinfo_cache_init(void)
3886 {
3887         int ret = 0;
3888
3889         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3890                                          sizeof(struct device_domain_info),
3891                                          0,
3892                                          SLAB_HWCACHE_ALIGN,
3893                                          NULL);
3894         if (!iommu_devinfo_cache) {
3895                 pr_err("Couldn't create devinfo cache\n");
3896                 ret = -ENOMEM;
3897         }
3898
3899         return ret;
3900 }
3901
3902 static int __init iommu_init_mempool(void)
3903 {
3904         int ret;
3905         ret = iova_cache_get();
3906         if (ret)
3907                 return ret;
3908
3909         ret = iommu_domain_cache_init();
3910         if (ret)
3911                 goto domain_error;
3912
3913         ret = iommu_devinfo_cache_init();
3914         if (!ret)
3915                 return ret;
3916
3917         kmem_cache_destroy(iommu_domain_cache);
3918 domain_error:
3919         iova_cache_put();
3920
3921         return -ENOMEM;
3922 }
3923
3924 static void __init iommu_exit_mempool(void)
3925 {
3926         kmem_cache_destroy(iommu_devinfo_cache);
3927         kmem_cache_destroy(iommu_domain_cache);
3928         iova_cache_put();
3929 }
3930
3931 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3932 {
3933         struct dmar_drhd_unit *drhd;
3934         u32 vtbar;
3935         int rc;
3936
3937         /* We know that this device on this chipset has its own IOMMU.
3938          * If we find it under a different IOMMU, then the BIOS is lying
3939          * to us. Hope that the IOMMU for this device is actually
3940          * disabled, and it needs no translation...
3941          */
3942         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3943         if (rc) {
3944                 /* "can't" happen */
3945                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3946                 return;
3947         }
3948         vtbar &= 0xffff0000;
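        /*
         * Hypothetical example: a config read returning 0xfed90123 masks to a
         * vtbar of 0xfed90000, so the matching DRHD is expected at 0xfed9a000.
         */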
3949
3950         /* we know that this iommu should be at offset 0xa000 from vtbar */
3951         drhd = dmar_find_matched_drhd_unit(pdev);
3952         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3953                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3954                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3955                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3956         }
3957 }
3958 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3959
3960 static void __init init_no_remapping_devices(void)
3961 {
3962         struct dmar_drhd_unit *drhd;
3963         struct device *dev;
3964         int i;
3965
3966         for_each_drhd_unit(drhd) {
3967                 if (!drhd->include_all) {
3968                         for_each_active_dev_scope(drhd->devices,
3969                                                   drhd->devices_cnt, i, dev)
3970                                 break;
3971                         /* ignore DMAR unit if no devices exist */
3972                         if (i == drhd->devices_cnt)
3973                                 drhd->ignored = 1;
3974                 }
3975         }
3976
3977         for_each_active_drhd_unit(drhd) {
3978                 if (drhd->include_all)
3979                         continue;
3980
3981                 for_each_active_dev_scope(drhd->devices,
3982                                           drhd->devices_cnt, i, dev)
3983                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3984                                 break;
3985                 if (i < drhd->devices_cnt)
3986                         continue;
3987
3988                 /* This IOMMU has *only* gfx devices. Either bypass it or
3989                    set the gfx_mapped flag, as appropriate */
3990                 if (!dmar_map_gfx) {
3991                         drhd->ignored = 1;
3992                         for_each_active_dev_scope(drhd->devices,
3993                                                   drhd->devices_cnt, i, dev)
3994                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3995                 }
3996         }
3997 }
3998
3999 #ifdef CONFIG_SUSPEND
4000 static int init_iommu_hw(void)
4001 {
4002         struct dmar_drhd_unit *drhd;
4003         struct intel_iommu *iommu = NULL;
4004
4005         for_each_active_iommu(iommu, drhd)
4006                 if (iommu->qi)
4007                         dmar_reenable_qi(iommu);
4008
4009         for_each_iommu(iommu, drhd) {
4010                 if (drhd->ignored) {
4011                         /*
4012                          * we always have to disable PMRs or DMA may fail on
4013                          * this device
4014                          */
4015                         if (force_on)
4016                                 iommu_disable_protect_mem_regions(iommu);
4017                         continue;
4018                 }
4019
4020                 iommu_flush_write_buffer(iommu);
4021
4022                 iommu_set_root_entry(iommu);
4023
4024                 iommu->flush.flush_context(iommu, 0, 0, 0,
4025                                            DMA_CCMD_GLOBAL_INVL);
4026                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4027                 iommu_enable_translation(iommu);
4028                 iommu_disable_protect_mem_regions(iommu);
4029         }
4030
4031         return 0;
4032 }
4033
4034 static void iommu_flush_all(void)
4035 {
4036         struct dmar_drhd_unit *drhd;
4037         struct intel_iommu *iommu;
4038
4039         for_each_active_iommu(iommu, drhd) {
4040                 iommu->flush.flush_context(iommu, 0, 0, 0,
4041                                            DMA_CCMD_GLOBAL_INVL);
4042                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4043                                          DMA_TLB_GLOBAL_FLUSH);
4044         }
4045 }
4046
4047 static int iommu_suspend(void)
4048 {
4049         struct dmar_drhd_unit *drhd;
4050         struct intel_iommu *iommu = NULL;
4051         unsigned long flag;
4052
4053         for_each_active_iommu(iommu, drhd) {
4054                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4055                                                  GFP_ATOMIC);
4056                 if (!iommu->iommu_state)
4057                         goto nomem;
4058         }
4059
4060         iommu_flush_all();
4061
4062         for_each_active_iommu(iommu, drhd) {
4063                 iommu_disable_translation(iommu);
4064
4065                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4066
4067                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4068                         readl(iommu->reg + DMAR_FECTL_REG);
4069                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4070                         readl(iommu->reg + DMAR_FEDATA_REG);
4071                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4072                         readl(iommu->reg + DMAR_FEADDR_REG);
4073                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4074                         readl(iommu->reg + DMAR_FEUADDR_REG);
4075
4076                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4077         }
4078         return 0;
4079
4080 nomem:
4081         for_each_active_iommu(iommu, drhd)
4082                 kfree(iommu->iommu_state);
4083
4084         return -ENOMEM;
4085 }
4086
4087 static void iommu_resume(void)
4088 {
4089         struct dmar_drhd_unit *drhd;
4090         struct intel_iommu *iommu = NULL;
4091         unsigned long flag;
4092
4093         if (init_iommu_hw()) {
4094                 if (force_on)
4095                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4096                 else
4097                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4098                 return;
4099         }
4100
4101         for_each_active_iommu(iommu, drhd) {
4102
4103                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4104
4105                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4106                         iommu->reg + DMAR_FECTL_REG);
4107                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4108                         iommu->reg + DMAR_FEDATA_REG);
4109                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4110                         iommu->reg + DMAR_FEADDR_REG);
4111                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4112                         iommu->reg + DMAR_FEUADDR_REG);
4113
4114                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4115         }
4116
4117         for_each_active_iommu(iommu, drhd)
4118                 kfree(iommu->iommu_state);
4119 }
4120
4121 static struct syscore_ops iommu_syscore_ops = {
4122         .resume         = iommu_resume,
4123         .suspend        = iommu_suspend,
4124 };
4125
4126 static void __init init_iommu_pm_ops(void)
4127 {
4128         register_syscore_ops(&iommu_syscore_ops);
4129 }
4130
4131 #else
4132 static inline void init_iommu_pm_ops(void) {}
4133 #endif  /* CONFIG_SUSPEND */
4134
4135
4136 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4137 {
4138         struct acpi_dmar_reserved_memory *rmrr;
4139         struct dmar_rmrr_unit *rmrru;
4140
4141         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4142         if (!rmrru)
4143                 return -ENOMEM;
4144
4145         rmrru->hdr = header;
4146         rmrr = (struct acpi_dmar_reserved_memory *)header;
4147         rmrru->base_address = rmrr->base_address;
4148         rmrru->end_address = rmrr->end_address;
4149         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4150                                 ((void *)rmrr) + rmrr->header.length,
4151                                 &rmrru->devices_cnt);
4152         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4153                 kfree(rmrru);
4154                 return -ENOMEM;
4155         }
4156
4157         list_add(&rmrru->list, &dmar_rmrr_units);
4158
4159         return 0;
4160 }
4161
4162 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4163 {
4164         struct dmar_atsr_unit *atsru;
4165         struct acpi_dmar_atsr *tmp;
4166
4167         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4168                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4169                 if (atsr->segment != tmp->segment)
4170                         continue;
4171                 if (atsr->header.length != tmp->header.length)
4172                         continue;
4173                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4174                         return atsru;
4175         }
4176
4177         return NULL;
4178 }
4179
4180 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4181 {
4182         struct acpi_dmar_atsr *atsr;
4183         struct dmar_atsr_unit *atsru;
4184
4185         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
4186                 return 0;
4187
4188         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4189         atsru = dmar_find_atsr(atsr);
4190         if (atsru)
4191                 return 0;
4192
4193         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4194         if (!atsru)
4195                 return -ENOMEM;
4196
4197         /*
4198          * If memory is allocated from slab by ACPI _DSM method, we need to
4199          * copy the memory content because the memory buffer will be freed
4200          * on return.
4201          */
4202         atsru->hdr = (void *)(atsru + 1);
4203         memcpy(atsru->hdr, hdr, hdr->length);
4204         atsru->include_all = atsr->flags & 0x1;
4205         if (!atsru->include_all) {
4206                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4207                                 (void *)atsr + atsr->header.length,
4208                                 &atsru->devices_cnt);
4209                 if (atsru->devices_cnt && atsru->devices == NULL) {
4210                         kfree(atsru);
4211                         return -ENOMEM;
4212                 }
4213         }
4214
4215         list_add_rcu(&atsru->list, &dmar_atsr_units);
4216
4217         return 0;
4218 }
4219
4220 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4221 {
4222         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4223         kfree(atsru);
4224 }
4225
4226 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4227 {
4228         struct acpi_dmar_atsr *atsr;
4229         struct dmar_atsr_unit *atsru;
4230
4231         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4232         atsru = dmar_find_atsr(atsr);
4233         if (atsru) {
4234                 list_del_rcu(&atsru->list);
4235                 synchronize_rcu();
4236                 intel_iommu_free_atsr(atsru);
4237         }
4238
4239         return 0;
4240 }
4241
4242 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4243 {
4244         int i;
4245         struct device *dev;
4246         struct acpi_dmar_atsr *atsr;
4247         struct dmar_atsr_unit *atsru;
4248
4249         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4250         atsru = dmar_find_atsr(atsr);
4251         if (!atsru)
4252                 return 0;
4253
4254         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4255                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4256                                           i, dev)
4257                         return -EBUSY;
4258         }
4259
4260         return 0;
4261 }
4262
4263 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4264 {
4265         int sp, ret = 0;
4266         struct intel_iommu *iommu = dmaru->iommu;
4267
4268         if (g_iommus[iommu->seq_id])
4269                 return 0;
4270
4271         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4272                 pr_warn("%s: Doesn't support hardware pass through.\n",
4273                         iommu->name);
4274                 return -ENXIO;
4275         }
4276         if (!ecap_sc_support(iommu->ecap) &&
4277             domain_update_iommu_snooping(iommu)) {
4278                 pr_warn("%s: Doesn't support snooping.\n",
4279                         iommu->name);
4280                 return -ENXIO;
4281         }
4282         sp = domain_update_iommu_superpage(iommu) - 1;
4283         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4284                 pr_warn("%s: Doesn't support large page.\n",
4285                         iommu->name);
4286                 return -ENXIO;
4287         }
4288
4289         /*
4290          * Disable translation if already enabled prior to OS handover.
4291          */
4292         if (iommu->gcmd & DMA_GCMD_TE)
4293                 iommu_disable_translation(iommu);
4294
4295         g_iommus[iommu->seq_id] = iommu;
4296         ret = iommu_init_domains(iommu);
4297         if (ret == 0)
4298                 ret = iommu_alloc_root_entry(iommu);
4299         if (ret)
4300                 goto out;
4301
4302 #ifdef CONFIG_INTEL_IOMMU_SVM
4303         if (pasid_enabled(iommu))
4304                 intel_svm_alloc_pasid_tables(iommu);
4305 #endif
4306
4307         if (dmaru->ignored) {
4308                 /*
4309                  * we always have to disable PMRs or DMA may fail on this device
4310                  */
4311                 if (force_on)
4312                         iommu_disable_protect_mem_regions(iommu);
4313                 return 0;
4314         }
4315
4316         intel_iommu_init_qi(iommu);
4317         iommu_flush_write_buffer(iommu);
4318
4319 #ifdef CONFIG_INTEL_IOMMU_SVM
4320         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4321                 ret = intel_svm_enable_prq(iommu);
4322                 if (ret)
4323                         goto disable_iommu;
4324         }
4325 #endif
4326         ret = dmar_set_interrupt(iommu);
4327         if (ret)
4328                 goto disable_iommu;
4329
4330         iommu_set_root_entry(iommu);
4331         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4332         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4333         iommu_enable_translation(iommu);
4334
4335         iommu_disable_protect_mem_regions(iommu);
4336         return 0;
4337
4338 disable_iommu:
4339         disable_dmar_iommu(iommu);
4340 out:
4341         free_dmar_iommu(iommu);
4342         return ret;
4343 }
4344
4345 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4346 {
4347         int ret = 0;
4348         struct intel_iommu *iommu = dmaru->iommu;
4349
4350         if (!intel_iommu_enabled)
4351                 return 0;
4352         if (iommu == NULL)
4353                 return -EINVAL;
4354
4355         if (insert) {
4356                 ret = intel_iommu_add(dmaru);
4357         } else {
4358                 disable_dmar_iommu(iommu);
4359                 free_dmar_iommu(iommu);
4360         }
4361
4362         return ret;
4363 }
4364
4365 static void intel_iommu_free_dmars(void)
4366 {
4367         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4368         struct dmar_atsr_unit *atsru, *atsr_n;
4369
4370         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4371                 list_del(&rmrru->list);
4372                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4373                 kfree(rmrru);
4374         }
4375
4376         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4377                 list_del(&atsru->list);
4378                 intel_iommu_free_atsr(atsru);
4379         }
4380 }
4381
4382 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4383 {
4384         int i, ret = 1;
4385         struct pci_bus *bus;
4386         struct pci_dev *bridge = NULL;
4387         struct device *tmp;
4388         struct acpi_dmar_atsr *atsr;
4389         struct dmar_atsr_unit *atsru;
4390
4391         dev = pci_physfn(dev);
4392         for (bus = dev->bus; bus; bus = bus->parent) {
4393                 bridge = bus->self;
4394                 /* If it's an integrated device, allow ATS */
4395                 if (!bridge)
4396                         return 1;
4397                 /* Connected via non-PCIe: no ATS */
4398                 if (!pci_is_pcie(bridge) ||
4399                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4400                         return 0;
4401                 /* If we found the root port, look it up in the ATSR */
4402                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4403                         break;
4404         }
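        /*
         * At this point 'bridge' is the root port for the device (or the walk
         * returned early); only the root port is matched against the ATSR
         * device scopes below, so ATS is allowed for the whole hierarchy under
         * a listed root port, or on any segment whose ATSR has include_all set.
         */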
4405
4406         rcu_read_lock();
4407         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4408                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4409                 if (atsr->segment != pci_domain_nr(dev->bus))
4410                         continue;
4411
4412                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4413                         if (tmp == &bridge->dev)
4414                                 goto out;
4415
4416                 if (atsru->include_all)
4417                         goto out;
4418         }
4419         ret = 0;
4420 out:
4421         rcu_read_unlock();
4422
4423         return ret;
4424 }
4425
4426 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4427 {
4428         int ret = 0;
4429         struct dmar_rmrr_unit *rmrru;
4430         struct dmar_atsr_unit *atsru;
4431         struct acpi_dmar_atsr *atsr;
4432         struct acpi_dmar_reserved_memory *rmrr;
4433
4434         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
4435                 return 0;
4436
4437         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4438                 rmrr = container_of(rmrru->hdr,
4439                                     struct acpi_dmar_reserved_memory, header);
4440                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4441                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4442                                 ((void *)rmrr) + rmrr->header.length,
4443                                 rmrr->segment, rmrru->devices,
4444                                 rmrru->devices_cnt);
4445                         if (ret < 0)
4446                                 return ret;
4447                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4448                         dmar_remove_dev_scope(info, rmrr->segment,
4449                                 rmrru->devices, rmrru->devices_cnt);
4450                 }
4451         }
4452
4453         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4454                 if (atsru->include_all)
4455                         continue;
4456
4457                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4458                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4459                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4460                                         (void *)atsr + atsr->header.length,
4461                                         atsr->segment, atsru->devices,
4462                                         atsru->devices_cnt);
4463                         if (ret > 0)
4464                                 break;
4465                         else if (ret < 0)
4466                                 return ret;
4467                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4468                         if (dmar_remove_dev_scope(info, atsr->segment,
4469                                         atsru->devices, atsru->devices_cnt))
4470                                 break;
4471                 }
4472         }
4473
4474         return 0;
4475 }
4476
4477 /*
4478  * Here we only respond to a device being unbound from its driver.
4479  *
4480  * A newly added device is not attached to its DMAR domain here yet; that
4481  * happens when the device is first mapped to an iova.
4482  */
4483 static int device_notifier(struct notifier_block *nb,
4484                                   unsigned long action, void *data)
4485 {
4486         struct device *dev = data;
4487         struct dmar_domain *domain;
4488
4489         if (iommu_dummy(dev))
4490                 return 0;
4491
4492         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4493                 return 0;
4494
4495         domain = find_domain(dev);
4496         if (!domain)
4497                 return 0;
4498
4499         dmar_remove_one_dev_info(domain, dev);
4500         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4501                 domain_exit(domain);
4502
4503         return 0;
4504 }
4505
4506 static struct notifier_block device_nb = {
4507         .notifier_call = device_notifier,
4508 };
4509
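     /*
      * Memory hotplug notifier for the static identity domain (si_domain):
      * extend the identity map when memory goes online and unmap/free the
      * corresponding IOVA range when it goes offline or onlining is
      * cancelled.
      */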
4510 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4511                                        unsigned long val, void *v)
4512 {
4513         struct memory_notify *mhp = v;
4514         unsigned long long start, end;
4515         unsigned long start_vpfn, last_vpfn;
4516
4517         switch (val) {
4518         case MEM_GOING_ONLINE:
4519                 start = mhp->start_pfn << PAGE_SHIFT;
4520                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4521                 if (iommu_domain_identity_map(si_domain, start, end)) {
4522                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4523                                 start, end);
4524                         return NOTIFY_BAD;
4525                 }
4526                 break;
4527
4528         case MEM_OFFLINE:
4529         case MEM_CANCEL_ONLINE:
4530                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4531                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4532                 while (start_vpfn <= last_vpfn) {
4533                         struct iova *iova;
4534                         struct dmar_drhd_unit *drhd;
4535                         struct intel_iommu *iommu;
4536                         struct page *freelist;
4537
4538                         iova = find_iova(&si_domain->iovad, start_vpfn);
4539                         if (iova == NULL) {
4540                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4541                                          start_vpfn);
4542                                 break;
4543                         }
4544
4545                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4546                                                      start_vpfn, last_vpfn);
4547                         if (iova == NULL) {
4548                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4549                                         start_vpfn, last_vpfn);
4550                                 return NOTIFY_BAD;
4551                         }
4552
4553                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4554                                                iova->pfn_hi);
4555
4556                         rcu_read_lock();
4557                         for_each_active_iommu(iommu, drhd)
4558                                 iommu_flush_iotlb_psi(iommu, si_domain,
4559                                         iova->pfn_lo, iova_size(iova),
4560                                         !freelist, 0);
4561                         rcu_read_unlock();
4562                         dma_free_pagelist(freelist);
4563
4564                         start_vpfn = iova->pfn_hi + 1;
4565                         free_iova_mem(iova);
4566                 }
4567                 break;
4568         }
4569
4570         return NOTIFY_OK;
4571 }
4572
4573 static struct notifier_block intel_iommu_memory_nb = {
4574         .notifier_call = intel_iommu_memory_notifier,
4575         .priority = 0
4576 };
4577
4578
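     /*
      * sysfs attributes exported via the "intel-iommu" attribute group,
      * exposing each unit's version, register base, capabilities and domain
      * usage.
      */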
4579 static ssize_t intel_iommu_show_version(struct device *dev,
4580                                         struct device_attribute *attr,
4581                                         char *buf)
4582 {
4583         struct intel_iommu *iommu = dev_get_drvdata(dev);
4584         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4585         return sprintf(buf, "%d:%d\n",
4586                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4587 }
4588 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4589
4590 static ssize_t intel_iommu_show_address(struct device *dev,
4591                                         struct device_attribute *attr,
4592                                         char *buf)
4593 {
4594         struct intel_iommu *iommu = dev_get_drvdata(dev);
4595         return sprintf(buf, "%llx\n", iommu->reg_phys);
4596 }
4597 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4598
4599 static ssize_t intel_iommu_show_cap(struct device *dev,
4600                                     struct device_attribute *attr,
4601                                     char *buf)
4602 {
4603         struct intel_iommu *iommu = dev_get_drvdata(dev);
4604         return sprintf(buf, "%llx\n", iommu->cap);
4605 }
4606 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4607
4608 static ssize_t intel_iommu_show_ecap(struct device *dev,
4609                                     struct device_attribute *attr,
4610                                     char *buf)
4611 {
4612         struct intel_iommu *iommu = dev_get_drvdata(dev);
4613         return sprintf(buf, "%llx\n", iommu->ecap);
4614 }
4615 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4616
4617 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4618                                       struct device_attribute *attr,
4619                                       char *buf)
4620 {
4621         struct intel_iommu *iommu = dev_get_drvdata(dev);
4622         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4623 }
4624 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4625
4626 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4627                                            struct device_attribute *attr,
4628                                            char *buf)
4629 {
4630         struct intel_iommu *iommu = dev_get_drvdata(dev);
4631         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4632                                                   cap_ndoms(iommu->cap)));
4633 }
4634 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4635
4636 static struct attribute *intel_iommu_attrs[] = {
4637         &dev_attr_version.attr,
4638         &dev_attr_address.attr,
4639         &dev_attr_cap.attr,
4640         &dev_attr_ecap.attr,
4641         &dev_attr_domains_supported.attr,
4642         &dev_attr_domains_used.attr,
4643         NULL,
4644 };
4645
4646 static struct attribute_group intel_iommu_group = {
4647         .name = "intel-iommu",
4648         .attrs = intel_iommu_attrs,
4649 };
4650
4651 const struct attribute_group *intel_iommu_groups[] = {
4652         &intel_iommu_group,
4653         NULL,
4654 };
4655
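     /*
      * intel_iommu_init - main VT-d initialization entry point: parse the
      * DMAR table and device scopes, set up the DMA remapping units, install
      * intel_dma_ops and the IOMMU ops, and register the bus and memory
      * hotplug notifiers.
      */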
4656 int __init intel_iommu_init(void)
4657 {
4658         int ret = -ENODEV;
4659         struct dmar_drhd_unit *drhd;
4660         struct intel_iommu *iommu;
4661
4662         /* VT-d is required for a TXT/tboot launch, so enforce that */
4663         force_on = tboot_force_iommu();
4664
4665         if (iommu_init_mempool()) {
4666                 if (force_on)
4667                         panic("tboot: Failed to initialize iommu memory\n");
4668                 return -ENOMEM;
4669         }
4670
4671         down_write(&dmar_global_lock);
4672         if (dmar_table_init()) {
4673                 if (force_on)
4674                         panic("tboot: Failed to initialize DMAR table\n");
4675                 goto out_free_dmar;
4676         }
4677
4678         if (dmar_dev_scope_init() < 0) {
4679                 if (force_on)
4680                         panic("tboot: Failed to initialize DMAR device scope\n");
4681                 goto out_free_dmar;
4682         }
4683
4684         if (no_iommu || dmar_disabled)
4685                 goto out_free_dmar;
4686
4687         if (list_empty(&dmar_rmrr_units))
4688                 pr_info("No RMRR found\n");
4689
4690         if (list_empty(&dmar_atsr_units))
4691                 pr_info("No ATSR found\n");
4692
4693         if (dmar_init_reserved_ranges()) {
4694                 if (force_on)
4695                         panic("tboot: Failed to reserve iommu ranges\n");
4696                 goto out_free_reserved_range;
4697         }
4698
4699         if (dmar_map_gfx)
4700                 intel_iommu_gfx_mapped = 1;
4701
4702         init_no_remapping_devices();
4703
4704         ret = init_dmars();
4705         if (ret) {
4706                 if (force_on)
4707                         panic("tboot: Failed to initialize DMARs\n");
4708                 pr_err("Initialization failed\n");
4709                 goto out_free_reserved_range;
4710         }
4711         up_write(&dmar_global_lock);
4712         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4713
4714         init_timer(&unmap_timer);
4715 #ifdef CONFIG_SWIOTLB
4716         swiotlb = 0;
4717 #endif
4718         dma_ops = &intel_dma_ops;
4719
4720         init_iommu_pm_ops();
4721
4722         for_each_active_iommu(iommu, drhd)
4723                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4724                                                        intel_iommu_groups,
4725                                                        "%s", iommu->name);
4726
4727         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4728         bus_register_notifier(&pci_bus_type, &device_nb);
4729         if (si_domain && !hw_pass_through)
4730                 register_memory_notifier(&intel_iommu_memory_nb);
4731
4732         intel_iommu_enabled = 1;
4733
4734         return 0;
4735
4736 out_free_reserved_range:
4737         put_iova_domain(&reserved_iova_list);
4738 out_free_dmar:
4739         intel_iommu_free_dmars();
4740         up_write(&dmar_global_lock);
4741         iommu_exit_mempool();
4742         return ret;
4743 }
4744
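     /*
      * Callback for pci_for_each_dma_alias(): clear the context entry for a
      * single DMA alias of the device.
      */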
4745 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4746 {
4747         struct intel_iommu *iommu = opaque;
4748
4749         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4750         return 0;
4751 }
4752
4753 /*
4754  * NB - intel-iommu lacks any sort of reference counting for the users of
4755  * dependent devices.  If multiple endpoints have intersecting dependent
4756  * devices, unbinding the driver from any one of them will possibly leave
4757  * the others unable to operate.
4758  */
4759 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4760 {
4761         if (!iommu || !dev || !dev_is_pci(dev))
4762                 return;
4763
4764         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4765 }
4766
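     /*
      * Tear down a device_domain_info: disable the device IOTLB, clear the
      * context entries, unlink the info from its domain and free it.  The
      * caller must hold device_domain_lock.
      */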
4767 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4768 {
4769         struct intel_iommu *iommu;
4770         unsigned long flags;
4771
4772         assert_spin_locked(&device_domain_lock);
4773
4774         if (WARN_ON(!info))
4775                 return;
4776
4777         iommu = info->iommu;
4778
4779         if (info->dev) {
4780                 iommu_disable_dev_iotlb(info);
4781                 domain_context_clear(iommu, info->dev);
4782         }
4783
4784         unlink_domain_info(info);
4785
4786         spin_lock_irqsave(&iommu->lock, flags);
4787         domain_detach_iommu(info->domain, iommu);
4788         spin_unlock_irqrestore(&iommu->lock, flags);
4789
4790         free_devinfo_mem(info);
4791 }
4792
4793 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4794                                      struct device *dev)
4795 {
4796         struct device_domain_info *info;
4797         unsigned long flags;
4798
4799         spin_lock_irqsave(&device_domain_lock, flags);
4800         info = dev->archdata.iommu;
4801         __dmar_remove_one_dev_info(info);
4802         spin_unlock_irqrestore(&device_domain_lock, flags);
4803 }
4804
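     /*
      * Initialize a domain allocated through the IOMMU API: set up its IOVA
      * allocator, reserved ranges, address-width parameters and top-level
      * page table.
      */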
4805 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4806 {
4807         int adjust_width;
4808
4809         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4810                         DMA_32BIT_PFN);
4811         domain_reserve_special_ranges(domain);
4812
4813         /* calculate AGAW */
4814         domain->gaw = guest_width;
4815         adjust_width = guestwidth_to_adjustwidth(guest_width);
4816         domain->agaw = width_to_agaw(adjust_width);
4817
4818         domain->iommu_coherency = 0;
4819         domain->iommu_snooping = 0;
4820         domain->iommu_superpage = 0;
4821         domain->max_addr = 0;
4822
4823         /* always allocate the top pgd */
4824         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4825         if (!domain->pgd)
4826                 return -ENOMEM;
4827         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4828         return 0;
4829 }
4830
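     /*
      * iommu_ops->domain_alloc callback: only IOMMU_DOMAIN_UNMANAGED is
      * supported; allocate a VM-type dmar_domain and report its aperture.
      */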
4831 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4832 {
4833         struct dmar_domain *dmar_domain;
4834         struct iommu_domain *domain;
4835
4836         if (type != IOMMU_DOMAIN_UNMANAGED)
4837                 return NULL;
4838
4839         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4840         if (!dmar_domain) {
4841                 pr_err("Can't allocate dmar_domain\n");
4842                 return NULL;
4843         }
4844         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4845                 pr_err("Domain initialization failed\n");
4846                 domain_exit(dmar_domain);
4847                 return NULL;
4848         }
4849         domain_update_iommu_cap(dmar_domain);
4850
4851         domain = &dmar_domain->domain;
4852         domain->geometry.aperture_start = 0;
4853         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4854         domain->geometry.force_aperture = true;
4855
4856         return domain;
4857 }
4858
4859 static void intel_iommu_domain_free(struct iommu_domain *domain)
4860 {
4861         domain_exit(to_dmar_domain(domain));
4862 }
4863
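     /*
      * iommu_ops->attach_dev callback: tear down any existing private domain
      * for the device, clamp the domain's address width to what the device's
      * IOMMU supports (dropping excess page-table levels), then add the
      * device to the domain.
      */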
4864 static int intel_iommu_attach_device(struct iommu_domain *domain,
4865                                      struct device *dev)
4866 {
4867         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4868         struct intel_iommu *iommu;
4869         int addr_width;
4870         u8 bus, devfn;
4871
4872         if (device_is_rmrr_locked(dev)) {
4873                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4874                 return -EPERM;
4875         }
4876
4877         /* normally dev is not mapped */
4878         if (unlikely(domain_context_mapped(dev))) {
4879                 struct dmar_domain *old_domain;
4880
4881                 old_domain = find_domain(dev);
4882                 if (old_domain) {
4883                         rcu_read_lock();
4884                         dmar_remove_one_dev_info(old_domain, dev);
4885                         rcu_read_unlock();
4886
4887                         if (!domain_type_is_vm_or_si(old_domain) &&
4888                              list_empty(&old_domain->devices))
4889                                 domain_exit(old_domain);
4890                 }
4891         }
4892
4893         iommu = device_to_iommu(dev, &bus, &devfn);
4894         if (!iommu)
4895                 return -ENODEV;
4896
4897         /* check if this iommu agaw is sufficient for max mapped address */
4898         addr_width = agaw_to_width(iommu->agaw);
4899         if (addr_width > cap_mgaw(iommu->cap))
4900                 addr_width = cap_mgaw(iommu->cap);
4901
4902         if (dmar_domain->max_addr > (1LL << addr_width)) {
4903                 pr_err("%s: iommu width (%d) is not "
4904                        "sufficient for the mapped address (%llx)\n",
4905                        __func__, addr_width, dmar_domain->max_addr);
4906                 return -EFAULT;
4907         }
4908         dmar_domain->gaw = addr_width;
4909
4910         /*
4911          * Knock out extra levels of page tables if necessary
4912          */
4913         while (iommu->agaw < dmar_domain->agaw) {
4914                 struct dma_pte *pte;
4915
4916                 pte = dmar_domain->pgd;
4917                 if (dma_pte_present(pte)) {
4918                         dmar_domain->pgd = (struct dma_pte *)
4919                                 phys_to_virt(dma_pte_addr(pte));
4920                         free_pgtable_page(pte);
4921                 }
4922                 dmar_domain->agaw--;
4923         }
4924
4925         return domain_add_dev_info(dmar_domain, dev);
4926 }
4927
4928 static void intel_iommu_detach_device(struct iommu_domain *domain,
4929                                       struct device *dev)
4930 {
4931         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4932 }
4933
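     /*
      * iommu_ops->map callback: translate IOMMU_* protection flags into VT-d
      * PTE bits, check that the mapping fits the domain's address width and
      * install the page-table entries.
      */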
4934 static int intel_iommu_map(struct iommu_domain *domain,
4935                            unsigned long iova, phys_addr_t hpa,
4936                            size_t size, int iommu_prot)
4937 {
4938         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4939         u64 max_addr;
4940         int prot = 0;
4941         int ret;
4942
4943         if (iommu_prot & IOMMU_READ)
4944                 prot |= DMA_PTE_READ;
4945         if (iommu_prot & IOMMU_WRITE)
4946                 prot |= DMA_PTE_WRITE;
4947         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4948                 prot |= DMA_PTE_SNP;
4949
4950         max_addr = iova + size;
4951         if (dmar_domain->max_addr < max_addr) {
4952                 u64 end;
4953
4954                 /* check if minimum agaw is sufficient for mapped address */
4955                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4956                 if (end < max_addr) {
4957                         pr_err("%s: iommu width (%d) is not "
4958                                "sufficient for the mapped address (%llx)\n",
4959                                __func__, dmar_domain->gaw, max_addr);
4960                         return -EFAULT;
4961                 }
4962                 dmar_domain->max_addr = max_addr;
4963         }
4964         /* Round up size to next multiple of PAGE_SIZE, if it and
4965            the low bits of hpa would take us onto the next page */
4966         size = aligned_nrpages(hpa, size);
4967         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4968                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4969         return ret;
4970 }
4971
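     /*
      * iommu_ops->unmap callback: tear down the page tables for the range,
      * flush the IOTLB on every IOMMU the domain is attached to, then free
      * the old page-table pages.
      */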
4972 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4973                                 unsigned long iova, size_t size)
4974 {
4975         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4976         struct page *freelist = NULL;
4977         struct intel_iommu *iommu;
4978         unsigned long start_pfn, last_pfn;
4979         unsigned int npages;
4980         int iommu_id, level = 0;
4981
4982         /* Cope with horrid API which requires us to unmap more than the
4983            size argument if it happens to be a large-page mapping. */
4984         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4985
4986         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4987                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4988
4989         start_pfn = iova >> VTD_PAGE_SHIFT;
4990         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4991
4992         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4993
4994         npages = last_pfn - start_pfn + 1;
4995
4996         for_each_domain_iommu(iommu_id, dmar_domain) {
4997                 iommu = g_iommus[iommu_id];
4998
4999                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5000                                       start_pfn, npages, !freelist, 0);
5001         }
5002
5003         dma_free_pagelist(freelist);
5004
5005         if (dmar_domain->max_addr == iova + size)
5006                 dmar_domain->max_addr = iova;
5007
5008         return size;
5009 }
5010
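     /*
      * iommu_ops->iova_to_phys callback: walk the page table and return the
      * physical address backing @iova, or 0 if it is not mapped.
      */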
5011 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5012                                             dma_addr_t iova)
5013 {
5014         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5015         struct dma_pte *pte;
5016         int level = 0;
5017         u64 phys = 0;
5018
5019         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5020         if (pte && dma_pte_present(pte))
5021                 phys = dma_pte_addr(pte) +
5022                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5023                                                 VTD_PAGE_SHIFT) - 1));
5024
5025         return phys;
5026 }
5027
5028 static bool intel_iommu_capable(enum iommu_cap cap)
5029 {
5030         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5031                 return domain_update_iommu_snooping(NULL) == 1;
5032         if (cap == IOMMU_CAP_INTR_REMAP)
5033                 return irq_remapping_enabled == 1;
5034
5035         return false;
5036 }
5037
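     /*
      * iommu_ops->add_device callback: link the device to its IOMMU's sysfs
      * node and place it in an IOMMU group.
      */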
5038 static int intel_iommu_add_device(struct device *dev)
5039 {
5040         struct intel_iommu *iommu;
5041         struct iommu_group *group;
5042         u8 bus, devfn;
5043
5044         iommu = device_to_iommu(dev, &bus, &devfn);
5045         if (!iommu)
5046                 return -ENODEV;
5047
5048         iommu_device_link(iommu->iommu_dev, dev);
5049
5050         group = iommu_group_get_for_dev(dev);
5051
5052         if (IS_ERR(group))
5053                 return PTR_ERR(group);
5054
5055         iommu_group_put(group);
5056         return 0;
5057 }
5058
5059 static void intel_iommu_remove_device(struct device *dev)
5060 {
5061         struct intel_iommu *iommu;
5062         u8 bus, devfn;
5063
5064         iommu = device_to_iommu(dev, &bus, &devfn);
5065         if (!iommu)
5066                 return;
5067
5068         iommu_group_remove_device(dev);
5069
5070         iommu_device_unlink(iommu->iommu_dev, dev);
5071 }
5072
5073 #ifdef CONFIG_INTEL_IOMMU_SVM
5074 #define MAX_NR_PASID_BITS (20)
5075 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5076 {
5077         /*
5078          * Convert ecap_pss to the extended context entry pts encoding; also
5079          * respect the soft pasid_max value set by the iommu.
5080          * - number of PASID bits = ecap_pss + 1
5081          * - number of PASID table entries = 2^(pts + 5)
5082          * Therefore, pts = ecap_pss - 4
5083          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5084          */
5085         if (ecap_pss(iommu->ecap) < 5)
5086                 return 0;
5087
5088         /* pasid_max is the actual number of table entries, not a bit count */
5089         return find_first_bit((unsigned long *)&iommu->pasid_max,
5090                         MAX_NR_PASID_BITS) - 5;
5091 }
5092
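     /*
      * Enable requests-with-PASID in the extended context entry for @sdev's
      * device: point it at this IOMMU's PASID (and PASID state) tables, pick
      * a suitable translation type, and record the domain and source IDs the
      * SVM code needs for later invalidations.
      */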
5093 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5094 {
5095         struct device_domain_info *info;
5096         struct context_entry *context;
5097         struct dmar_domain *domain;
5098         unsigned long flags;
5099         u64 ctx_lo;
5100         int ret;
5101
5102         domain = get_valid_domain_for_dev(sdev->dev);
5103         if (!domain)
5104                 return -EINVAL;
5105
5106         spin_lock_irqsave(&device_domain_lock, flags);
5107         spin_lock(&iommu->lock);
5108
5109         ret = -EINVAL;
5110         info = sdev->dev->archdata.iommu;
5111         if (!info || !info->pasid_supported)
5112                 goto out;
5113
5114         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5115         if (WARN_ON(!context))
5116                 goto out;
5117
5118         ctx_lo = context[0].lo;
5119
5120         sdev->did = domain->iommu_did[iommu->seq_id];
5121         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5122
5123         if (!(ctx_lo & CONTEXT_PASIDE)) {
5124                 context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5125                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5126                         intel_iommu_get_pts(iommu);
5127
5128                 wmb();
5129                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5130                  * extended to permit requests-with-PASID if the PASIDE bit
5131                  * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5132                  * however, the PASIDE bit is ignored and requests-with-PASID
5133                  * are unconditionally blocked, which makes less sense.
5134                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5135                  * "guest mode" translation types, depending on whether ATS
5136                  * is available. Annoyingly, we can't use the new modes
5137                  * *unless* PASIDE is set. */
5138                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5139                         ctx_lo &= ~CONTEXT_TT_MASK;
5140                         if (info->ats_supported)
5141                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5142                         else
5143                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5144                 }
5145                 ctx_lo |= CONTEXT_PASIDE;
5146                 if (iommu->pasid_state_table)
5147                         ctx_lo |= CONTEXT_DINVE;
5148                 if (info->pri_supported)
5149                         ctx_lo |= CONTEXT_PRS;
5150                 context[0].lo = ctx_lo;
5151                 wmb();
5152                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5153                                            DMA_CCMD_MASK_NOBIT,
5154                                            DMA_CCMD_DEVICE_INVL);
5155         }
5156
5157         /* Enable PASID support in the device, if it wasn't already */
5158         if (!info->pasid_enabled)
5159                 iommu_enable_dev_iotlb(info);
5160
5161         if (info->ats_enabled) {
5162                 sdev->dev_iotlb = 1;
5163                 sdev->qdep = info->ats_qdep;
5164                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5165                         sdev->qdep = 0;
5166         }
5167         ret = 0;
5168
5169  out:
5170         spin_unlock(&iommu->lock);
5171         spin_unlock_irqrestore(&device_domain_lock, flags);
5172
5173         return ret;
5174 }
5175
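     /*
      * Look up the IOMMU covering @dev for SVM use, refusing devices that
      * have no IOMMU translation or whose IOMMU has no PASID table.
      */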
5176 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5177 {
5178         struct intel_iommu *iommu;
5179         u8 bus, devfn;
5180
5181         if (iommu_dummy(dev)) {
5182                 dev_warn(dev,
5183                          "No IOMMU translation for device; cannot enable SVM\n");
5184                 return NULL;
5185         }
5186
5187         iommu = device_to_iommu(dev, &bus, &devfn);
5188         if (!iommu) {
5189                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5190                 return NULL;
5191         }
5192
5193         if (!iommu->pasid_table) {
5194                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5195                 return NULL;
5196         }
5197
5198         return iommu;
5199 }
5200 #endif /* CONFIG_INTEL_IOMMU_SVM */
5201
5202 static const struct iommu_ops intel_iommu_ops = {
5203         .capable        = intel_iommu_capable,
5204         .domain_alloc   = intel_iommu_domain_alloc,
5205         .domain_free    = intel_iommu_domain_free,
5206         .attach_dev     = intel_iommu_attach_device,
5207         .detach_dev     = intel_iommu_detach_device,
5208         .map            = intel_iommu_map,
5209         .unmap          = intel_iommu_unmap,
5210         .map_sg         = default_iommu_map_sg,
5211         .iova_to_phys   = intel_iommu_iova_to_phys,
5212         .add_device     = intel_iommu_add_device,
5213         .remove_device  = intel_iommu_remove_device,
5214         .device_group   = pci_device_group,
5215         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
5216 };
5217
5218 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5219 {
5220         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5221         pr_info("Disabling IOMMU for graphics on this chipset\n");
5222         dmar_map_gfx = 0;
5223 }
5224
5225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5232
5233 static void quirk_iommu_rwbf(struct pci_dev *dev)
5234 {
5235         /*
5236          * Mobile 4 Series Chipset neglects to set RWBF capability,
5237          * but needs it. Same seems to hold for the desktop versions.
5238          */
5239         pr_info("Forcing write-buffer flush capability\n");
5240         rwbf_quirk = 1;
5241 }
5242
5243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5250
5251 #define GGC 0x52
5252 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5253 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5254 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5255 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5256 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5257 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5258 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5259 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5260
5261 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5262 {
5263         unsigned short ggc;
5264
5265         if (pci_read_config_word(dev, GGC, &ggc))
5266                 return;
5267
5268         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5269                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5270                 dmar_map_gfx = 0;
5271         } else if (dmar_map_gfx) {
5272                 /* we have to ensure the gfx device is idle before we flush */
5273                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5274                 intel_iommu_strict = 1;
5275         }
5276 }
5277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5279 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5280 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5281
5282 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5283    ISOCH DMAR unit for the Azalia sound device, but not give it any
5284    TLB entries, which causes it to deadlock. Check for that.  We do
5285    this in a function called from init_dmars(), instead of in a PCI
5286    quirk, because we don't want to print the obnoxious "BIOS broken"
5287    message if VT-d is actually disabled.
5288 */
5289 static void __init check_tylersburg_isoch(void)
5290 {
5291         struct pci_dev *pdev;
5292         uint32_t vtisochctrl;
5293
5294         /* If there's no Azalia in the system anyway, forget it. */
5295         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5296         if (!pdev)
5297                 return;
5298         pci_dev_put(pdev);
5299
5300         /* System Management Registers. Might be hidden, in which case
5301            we can't do the sanity check. But that's OK, because the
5302            known-broken BIOSes _don't_ actually hide it, so far. */
5303         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5304         if (!pdev)
5305                 return;
5306
5307         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5308                 pci_dev_put(pdev);
5309                 return;
5310         }
5311
5312         pci_dev_put(pdev);
5313
5314         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5315         if (vtisochctrl & 1)
5316                 return;
5317
5318         /* Drop all bits other than the number of TLB entries */
5319         vtisochctrl &= 0x1c;
5320
5321         /* If we have the recommended number of TLB entries (16), fine. */
5322         if (vtisochctrl == 0x10)
5323                 return;
5324
5325         /* Zero TLB entries? You get to ride the short bus to school. */
5326         if (!vtisochctrl) {
5327                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5328                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5329                      dmi_get_system_info(DMI_BIOS_VENDOR),
5330                      dmi_get_system_info(DMI_BIOS_VERSION),
5331                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5332                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5333                 return;
5334         }
5335
5336         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5337                vtisochctrl);
5338 }