drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/crash_dump.h>
  49 #include <asm/irq_remapping.h>
  50 #include <asm/cacheflush.h>
  51 #include <asm/iommu.h>
  52
  53 #include "irq_remapping.h"
  54
  55 #define ROOT_SIZE               VTD_PAGE_SIZE
  56 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
  63 #define IOAPIC_RANGE_START      (0xfee00000)
  64 #define IOAPIC_RANGE_END        (0xfeefffff)
  65 #define IOVA_START_ADDR         (0x1000)
  66
  67 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  68
  69 #define MAX_AGAW_WIDTH 64
  70 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  80
  81 /* IO virtual address start page frame number */
  82 #define IOVA_START_PFN          (1)
  83
  84 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  85 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
  86 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
  87
  88 /* page table handling */
  89 #define LEVEL_STRIDE            (9)
  90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
  92 /*
  93  * This bitmap is used to advertise the page sizes our hardware support
  94  * to the IOMMU core, which will then use this information to split
  95  * physically contiguous memory regions it is mapping into page sizes
  96  * that we support.
  97  *
  98  * Traditionally the IOMMU core just handed us the mappings directly,
  99  * after making sure the size is an order of a 4KiB page and that the
 100  * mapping has natural alignment.
 101  *
 102  * To retain this behavior, we currently advertise that we support
 103  * all page sizes that are an order of 4KiB.
 104  *
 105  * If at some point we'd like to utilize the IOMMU core's new behavior,
 106  * we could change this to advertise the real page sizes we support.
 107  */
 108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
 175 /* global iommu list, set NULL for ignored DMAR units */
 176 static struct intel_iommu **g_iommus;
 177
 178 static void __init check_tylersburg_isoch(void);
 179 static int rwbf_quirk;
 180
 181 /*
 182  * set to 1 to panic kernel if can't successfully enable VT-d
 183  * (used when kernel is launched w/ TXT)
 184  */
 185 static int force_on = 0;
 186 int intel_iommu_tboot_noforce;
 187
 188 /*
 189  * 0: Present
 190  * 1-11: Reserved
 191  * 12-63: Context Ptr (12 - (haw-1))
 192  * 64-127: Reserved
 193  */
 194 struct root_entry {
 195         u64     lo;
 196         u64     hi;
 197 };
 198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200 /*
 201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202  * if marked present.
 203  */
 204 static phys_addr_t root_entry_lctp(struct root_entry *re)
 205 {
 206         if (!(re->lo & 1))
 207                 return 0;
 208
 209         return re->lo & VTD_PAGE_MASK;
 210 }
 211
 212 /*
 213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214  * if marked present.
 215  */
 216 static phys_addr_t root_entry_uctp(struct root_entry *re)
 217 {
 218         if (!(re->hi & 1))
 219                 return 0;
 220
 221         return re->hi & VTD_PAGE_MASK;
 222 }
 223 /*
 224  * low 64 bits:
 225  * 0: present
 226  * 1: fault processing disable
 227  * 2-3: translation type
 228  * 12-63: address space root
 229  * high 64 bits:
 230  * 0-2: address width
 231  * 3-6: aval
 232  * 8-23: domain id
 233  */
 234 struct context_entry {
 235         u64 lo;
 236         u64 hi;
 237 };
 238
 239 static inline void context_clear_pasid_enable(struct context_entry *context)
 240 {
 241         context->lo &= ~(1ULL << 11);
 242 }
 243
 244 static inline bool context_pasid_enabled(struct context_entry *context)
 245 {
 246         return !!(context->lo & (1ULL << 11));
 247 }
 248
 249 static inline void context_set_copied(struct context_entry *context)
 250 {
 251         context->hi |= (1ull << 3);
 252 }
 253
 254 static inline bool context_copied(struct context_entry *context)
 255 {
 256         return !!(context->hi & (1ULL << 3));
 257 }
 258
 259 static inline bool __context_present(struct context_entry *context)
 260 {
 261         return (context->lo & 1);
 262 }
 263
 264 static inline bool context_present(struct context_entry *context)
 265 {
 266         return context_pasid_enabled(context) ?
 267              __context_present(context) :
 268              __context_present(context) && !context_copied(context);
 269 }
 270
 271 static inline void context_set_present(struct context_entry *context)
 272 {
 273         context->lo |= 1;
 274 }
 275
 276 static inline void context_set_fault_enable(struct context_entry *context)
 277 {
 278         context->lo &= (((u64)-1) << 2) | 1;
 279 }
 280
 281 static inline void context_set_translation_type(struct context_entry *context,
 282                                                 unsigned long value)
 283 {
 284         context->lo &= (((u64)-1) << 4) | 3;
 285         context->lo |= (value & 3) << 2;
 286 }
 287
 288 static inline void context_set_address_root(struct context_entry *context,
 289                                             unsigned long value)
 290 {
 291         context->lo &= ~VTD_PAGE_MASK;
 292         context->lo |= value & VTD_PAGE_MASK;
 293 }
 294
 295 static inline void context_set_address_width(struct context_entry *context,
 296                                              unsigned long value)
 297 {
 298         context->hi |= value & 7;
 299 }
 300
 301 static inline void context_set_domain_id(struct context_entry *context,
 302                                          unsigned long value)
 303 {
 304         context->hi |= (value & ((1 << 16) - 1)) << 8;
 305 }
 306
 307 static inline int context_domain_id(struct context_entry *c)
 308 {
 309         return((c->hi >> 8) & 0xffff);
 310 }
 311
 312 static inline void context_clear_entry(struct context_entry *context)
 313 {
 314         context->lo = 0;
 315         context->hi = 0;
 316 }
 317
 318 /*
 319  * 0: readable
 320  * 1: writable
 321  * 2-6: reserved
 322  * 7: super page
 323  * 8-10: available
 324  * 11: snoop behavior
 325  * 12-63: Host physcial address
 326  */
 327 struct dma_pte {
 328         u64 val;
 329 };
 330
 331 static inline void dma_clear_pte(struct dma_pte *pte)
 332 {
 333         pte->val = 0;
 334 }
 335
 336 static inline u64 dma_pte_addr(struct dma_pte *pte)
 337 {
 338 #ifdef CONFIG_64BIT
 339         return pte->val & VTD_PAGE_MASK;
 340 #else
 341         /* Must have a full atomic 64-bit read */
 342         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343 #endif
 344 }
 345
 346 static inline bool dma_pte_present(struct dma_pte *pte)
 347 {
 348         return (pte->val & 3) != 0;
 349 }
 350
 351 static inline bool dma_pte_superpage(struct dma_pte *pte)
 352 {
 353         return (pte->val & DMA_PTE_LARGE_PAGE);
 354 }
 355
 356 static inline int first_pte_in_page(struct dma_pte *pte)
 357 {
 358         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359 }
 360
 361 /*
 362  * This domain is a statically identity mapping domain.
 363  *      1. This domain creats a static 1:1 mapping to all usable memory.
 364  *      2. It maps to each iommu if successful.
 365  *      3. Each iommu mapps to this domain if successful.
 366  */
 367 static struct dmar_domain *si_domain;
 368 static int hw_pass_through = 1;
 369
 370 /*
 371  * Domain represents a virtual machine, more than one devices
 372  * across iommus may be owned in one domain, e.g. kvm guest.
 373  */
 374 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
 375
 376 /* si_domain contains mulitple devices */
 377 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 378
 379 #define for_each_domain_iommu(idx, domain)                      \
 380         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 381                 if (domain->iommu_refcnt[idx])
 382
 383 struct dmar_domain {
 384         int     nid;                    /* node id */
 385
 386         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
 387                                         /* Refcount of devices per iommu */
 388
 389
 390         u16             iommu_did[DMAR_UNITS_SUPPORTED];
 391                                         /* Domain ids per IOMMU. Use u16 since
 392                                          * domain ids are 16 bit wide according
 393                                          * to VT-d spec, section 9.3 */
 394
 395         bool has_iotlb_device;
 396         struct list_head devices;       /* all devices' list */
 397         struct iova_domain iovad;       /* iova's that belong to this domain */
 398
 399         struct dma_pte  *pgd;           /* virtual address */
 400         int             gaw;            /* max guest address width */
 401
 402         /* adjusted guest address width, 0 is level 2 30-bit */
 403         int             agaw;
 404
 405         int             flags;          /* flags to find out type of domain */
 406
 407         int             iommu_coherency;/* indicate coherency of iommu access */
 408         int             iommu_snooping; /* indicate snooping control feature*/
 409         int             iommu_count;    /* reference count of iommu */
 410         int             iommu_superpage;/* Level of superpages supported:
 411                                            0 == 4KiB (no superpages), 1 == 2MiB,
 412                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 413         u64             max_addr;       /* maximum mapped address */
 414
 415         struct iommu_domain domain;     /* generic domain data structure for
 416                                            iommu core */
 417 };
 418
 419 /* PCI domain-device relationship */
 420 struct device_domain_info {
 421         struct list_head link;  /* link to domain siblings */
 422         struct list_head global; /* link to global list */
 423         u8 bus;                 /* PCI bus number */
 424         u8 devfn;               /* PCI devfn number */
 425         u16 pfsid;              /* SRIOV physical function source ID */
 426         u8 pasid_supported:3;
 427         u8 pasid_enabled:1;
 428         u8 pri_supported:1;
 429         u8 pri_enabled:1;
 430         u8 ats_supported:1;
 431         u8 ats_enabled:1;
 432         u8 ats_qdep;
 433         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 434         struct intel_iommu *iommu; /* IOMMU used by this device */
 435         struct dmar_domain *domain; /* pointer to domain */
 436 };
 437
 438 struct dmar_rmrr_unit {
 439         struct list_head list;          /* list of rmrr units   */
 440         struct acpi_dmar_header *hdr;   /* ACPI header          */
 441         u64     base_address;           /* reserved base address*/
 442         u64     end_address;            /* reserved end address */
 443         struct dmar_dev_scope *devices; /* target devices */
 444         int     devices_cnt;            /* target device count */
 445 };
 446
 447 struct dmar_atsr_unit {
 448         struct list_head list;          /* list of ATSR units */
 449         struct acpi_dmar_header *hdr;   /* ACPI header */
 450         struct dmar_dev_scope *devices; /* target devices */
 451         int devices_cnt;                /* target device count */
 452         u8 include_all:1;               /* include all ports */
 453 };
 454
 455 static LIST_HEAD(dmar_atsr_units);
 456 static LIST_HEAD(dmar_rmrr_units);
 457
 458 #define for_each_rmrr_units(rmrr) \
 459         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 460
 461 /* bitmap for indexing intel_iommus */
 462 static int g_num_of_iommus;
 463
 464 static void domain_exit(struct dmar_domain *domain);
 465 static void domain_remove_dev_info(struct dmar_domain *domain);
 466 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 467                                      struct device *dev);
 468 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 469 static void domain_context_clear(struct intel_iommu *iommu,
 470                                  struct device *dev);
 471 static int domain_detach_iommu(struct dmar_domain *domain,
 472                                struct intel_iommu *iommu);
 473
 474 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 475 int dmar_disabled = 0;
 476 #else
 477 int dmar_disabled = 1;
 478 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 479
 480 int intel_iommu_enabled = 0;
 481 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 482
 483 static int dmar_map_gfx = 1;
 484 static int dmar_forcedac;
 485 static int intel_iommu_strict;
 486 static int intel_iommu_superpage = 1;
 487 static int intel_iommu_ecs = 1;
 488 static int intel_iommu_pasid28;
 489 static int iommu_identity_mapping;
 490
 491 #define IDENTMAP_ALL            1
 492 #define IDENTMAP_GFX            2
 493 #define IDENTMAP_AZALIA         4
 494
 495 /* Broadwell and Skylake have broken ECS support — normal so-called "second
 496  * level" translation of DMA requests-without-PASID doesn't actually happen
 497  * unless you also set the NESTE bit in an extended context-entry. Which of
 498  * course means that SVM doesn't work because it's trying to do nested
 499  * translation of the physical addresses it finds in the process page tables,
 500  * through the IOVA->phys mapping found in the "second level" page tables.
 501  *
 502  * The VT-d specification was retroactively changed to change the definition
 503  * of the capability bits and pretend that Broadwell/Skylake never happened...
 504  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 505  * for some reason it was the PASID capability bit which was redefined (from
 506  * bit 28 on BDW/SKL to bit 40 in future).
 507  *
 508  * So our test for ECS needs to eschew those implementations which set the old
 509  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
 510  * Unless we are working around the 'pasid28' limitations, that is, by putting
 511  * the device into passthrough mode for normal DMA and thus masking the bug.
 512  */
 513 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 514                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 515 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 516  * or new capability bits are set. */
 517 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 518                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 519
 520 int intel_iommu_gfx_mapped;
 521 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 522
 523 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 524 static DEFINE_SPINLOCK(device_domain_lock);
 525 static LIST_HEAD(device_domain_list);
 526
 527 const struct iommu_ops intel_iommu_ops;
 528
 529 static bool translation_pre_enabled(struct intel_iommu *iommu)
 530 {
 531         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 532 }
 533
 534 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 535 {
 536         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 537 }
 538
 539 static void init_translation_status(struct intel_iommu *iommu)
 540 {
 541         u32 gsts;
 542
 543         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 544         if (gsts & DMA_GSTS_TES)
 545                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 546 }
 547
 548 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 549 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 550 {
 551         return container_of(dom, struct dmar_domain, domain);
 552 }
 553
 554 static int __init intel_iommu_setup(char *str)
 555 {
 556         if (!str)
 557                 return -EINVAL;
 558         while (*str) {
 559                 if (!strncmp(str, "on", 2)) {
 560                         dmar_disabled = 0;
 561                         pr_info("IOMMU enabled\n");
 562                 } else if (!strncmp(str, "off", 3)) {
 563                         dmar_disabled = 1;
 564                         pr_info("IOMMU disabled\n");
 565                 } else if (!strncmp(str, "igfx_off", 8)) {
 566                         dmar_map_gfx = 0;
 567                         pr_info("Disable GFX device mapping\n");
 568                 } else if (!strncmp(str, "forcedac", 8)) {
 569                         pr_info("Forcing DAC for PCI devices\n");
 570                         dmar_forcedac = 1;
 571                 } else if (!strncmp(str, "strict", 6)) {
 572                         pr_info("Disable batched IOTLB flush\n");
 573                         intel_iommu_strict = 1;
 574                 } else if (!strncmp(str, "sp_off", 6)) {
 575                         pr_info("Disable supported super page\n");
 576                         intel_iommu_superpage = 0;
 577                 } else if (!strncmp(str, "ecs_off", 7)) {
 578                         printk(KERN_INFO
 579                                 "Intel-IOMMU: disable extended context table support\n");
 580                         intel_iommu_ecs = 0;
 581                 } else if (!strncmp(str, "pasid28", 7)) {
 582                         printk(KERN_INFO
 583                                 "Intel-IOMMU: enable pre-production PASID support\n");
 584                         intel_iommu_pasid28 = 1;
 585                         iommu_identity_mapping |= IDENTMAP_GFX;
 586                 } else if (!strncmp(str, "tboot_noforce", 13)) {
 587                         printk(KERN_INFO
 588                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 589                         intel_iommu_tboot_noforce = 1;
 590                 }
 591
 592                 str += strcspn(str, ",");
 593                 while (*str == ',')
 594                         str++;
 595         }
 596         return 0;
 597 }
 598 __setup("intel_iommu=", intel_iommu_setup);
 599
 600 static struct kmem_cache *iommu_domain_cache;
 601 static struct kmem_cache *iommu_devinfo_cache;
 602
 603 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 604 {
 605         struct dmar_domain **domains;
 606         int idx = did >> 8;
 607
 608         domains = iommu->domains[idx];
 609         if (!domains)
 610                 return NULL;
 611
 612         return domains[did & 0xff];
 613 }
 614
 615 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 616                              struct dmar_domain *domain)
 617 {
 618         struct dmar_domain **domains;
 619         int idx = did >> 8;
 620
 621         if (!iommu->domains[idx]) {
 622                 size_t size = 256 * sizeof(struct dmar_domain *);
 623                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 624         }
 625
 626         domains = iommu->domains[idx];
 627         if (WARN_ON(!domains))
 628                 return;
 629         else
 630                 domains[did & 0xff] = domain;
 631 }
 632
 633 static inline void *alloc_pgtable_page(int node)
 634 {
 635         struct page *page;
 636         void *vaddr = NULL;
 637
 638         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 639         if (page)
 640                 vaddr = page_address(page);
 641         return vaddr;
 642 }
 643
 644 static inline void free_pgtable_page(void *vaddr)
 645 {
 646         free_page((unsigned long)vaddr);
 647 }
 648
 649 static inline void *alloc_domain_mem(void)
 650 {
 651         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 652 }
 653
 654 static void free_domain_mem(void *vaddr)
 655 {
 656         kmem_cache_free(iommu_domain_cache, vaddr);
 657 }
 658
 659 static inline void * alloc_devinfo_mem(void)
 660 {
 661         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 662 }
 663
 664 static inline void free_devinfo_mem(void *vaddr)
 665 {
 666         kmem_cache_free(iommu_devinfo_cache, vaddr);
 667 }
 668
 669 static inline int domain_type_is_vm(struct dmar_domain *domain)
 670 {
 671         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 672 }
 673
 674 static inline int domain_type_is_si(struct dmar_domain *domain)
 675 {
 676         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 677 }
 678
 679 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 680 {
 681         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 682                                 DOMAIN_FLAG_STATIC_IDENTITY);
 683 }
 684
 685 static inline int domain_pfn_supported(struct dmar_domain *domain,
 686                                        unsigned long pfn)
 687 {
 688         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 689
 690         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 691 }
 692
 693 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 694 {
 695         unsigned long sagaw;
 696         int agaw = -1;
 697
 698         sagaw = cap_sagaw(iommu->cap);
 699         for (agaw = width_to_agaw(max_gaw);
 700              agaw >= 0; agaw--) {
 701                 if (test_bit(agaw, &sagaw))
 702                         break;
 703         }
 704
 705         return agaw;
 706 }
 707
 708 /*
 709  * Calculate max SAGAW for each iommu.
 710  */
 711 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 712 {
 713         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 714 }
 715
 716 /*
 717  * calculate agaw for each iommu.
 718  * "SAGAW" may be different across iommus, use a default agaw, and
 719  * get a supported less agaw for iommus that don't support the default agaw.
 720  */
 721 int iommu_calculate_agaw(struct intel_iommu *iommu)
 722 {
 723         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 724 }
 725
 726 /* This functionin only returns single iommu in a domain */
 727 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 728 {
 729         int iommu_id;
 730
 731         /* si_domain and vm domain should not get here. */
 732         BUG_ON(domain_type_is_vm_or_si(domain));
 733         for_each_domain_iommu(iommu_id, domain)
 734                 break;
 735
 736         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 737                 return NULL;
 738
 739         return g_iommus[iommu_id];
 740 }
 741
 742 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 743 {
 744         struct dmar_drhd_unit *drhd;
 745         struct intel_iommu *iommu;
 746         bool found = false;
 747         int i;
 748
 749         domain->iommu_coherency = 1;
 750
 751         for_each_domain_iommu(i, domain) {
 752                 found = true;
 753                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 754                         domain->iommu_coherency = 0;
 755                         break;
 756                 }
 757         }
 758         if (found)
 759                 return;
 760
 761         /* No hardware attached; use lowest common denominator */
 762         rcu_read_lock();
 763         for_each_active_iommu(iommu, drhd) {
 764                 if (!ecap_coherent(iommu->ecap)) {
 765                         domain->iommu_coherency = 0;
 766                         break;
 767                 }
 768         }
 769         rcu_read_unlock();
 770 }
 771
 772 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 773 {
 774         struct dmar_drhd_unit *drhd;
 775         struct intel_iommu *iommu;
 776         int ret = 1;
 777
 778         rcu_read_lock();
 779         for_each_active_iommu(iommu, drhd) {
 780                 if (iommu != skip) {
 781                         if (!ecap_sc_support(iommu->ecap)) {
 782                                 ret = 0;
 783                                 break;
 784                         }
 785                 }
 786         }
 787         rcu_read_unlock();
 788
 789         return ret;
 790 }
 791
 792 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 793 {
 794         struct dmar_drhd_unit *drhd;
 795         struct intel_iommu *iommu;
 796         int mask = 0xf;
 797
 798         if (!intel_iommu_superpage) {
 799                 return 0;
 800         }
 801
 802         /* set iommu_superpage to the smallest common denominator */
 803         rcu_read_lock();
 804         for_each_active_iommu(iommu, drhd) {
 805                 if (iommu != skip) {
 806                         mask &= cap_super_page_val(iommu->cap);
 807                         if (!mask)
 808                                 break;
 809                 }
 810         }
 811         rcu_read_unlock();
 812
 813         return fls(mask);
 814 }
 815
 816 /* Some capabilities may be different across iommus */
 817 static void domain_update_iommu_cap(struct dmar_domain *domain)
 818 {
 819         domain_update_iommu_coherency(domain);
 820         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 821         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 822 }
 823
 824 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 825                                                        u8 bus, u8 devfn, int alloc)
 826 {
 827         struct root_entry *root = &iommu->root_entry[bus];
 828         struct context_entry *context;
 829         u64 *entry;
 830
 831         entry = &root->lo;
 832         if (ecs_enabled(iommu)) {
 833                 if (devfn >= 0x80) {
 834                         devfn -= 0x80;
 835                         entry = &root->hi;
 836                 }
 837                 devfn *= 2;
 838         }
 839         if (*entry & 1)
 840                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 841         else {
 842                 unsigned long phy_addr;
 843                 if (!alloc)
 844                         return NULL;
 845
 846                 context = alloc_pgtable_page(iommu->node);
 847                 if (!context)
 848                         return NULL;
 849
 850                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 851                 phy_addr = virt_to_phys((void *)context);
 852                 *entry = phy_addr | 1;
 853                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 854         }
 855         return &context[devfn];
 856 }
 857
 858 static int iommu_dummy(struct device *dev)
 859 {
 860         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 861 }
 862
 863 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 864 {
 865         struct dmar_drhd_unit *drhd = NULL;
 866         struct intel_iommu *iommu;
 867         struct device *tmp;
 868         struct pci_dev *ptmp, *pdev = NULL;
 869         u16 segment = 0;
 870         int i;
 871
 872         if (iommu_dummy(dev))
 873                 return NULL;
 874
 875         if (dev_is_pci(dev)) {
 876                 struct pci_dev *pf_pdev;
 877
 878                 pdev = to_pci_dev(dev);
 879
 880 #ifdef CONFIG_X86
 881                 /* VMD child devices currently cannot be handled individually */
 882                 if (is_vmd(pdev->bus))
 883                         return NULL;
 884 #endif
 885
 886                 /* VFs aren't listed in scope tables; we need to look up
 887                  * the PF instead to find the IOMMU. */
 888                 pf_pdev = pci_physfn(pdev);
 889                 dev = &pf_pdev->dev;
 890                 segment = pci_domain_nr(pdev->bus);
 891         } else if (has_acpi_companion(dev))
 892                 dev = &ACPI_COMPANION(dev)->dev;
 893
 894         rcu_read_lock();
 895         for_each_active_iommu(iommu, drhd) {
 896                 if (pdev && segment != drhd->segment)
 897                         continue;
 898
 899                 for_each_active_dev_scope(drhd->devices,
 900                                           drhd->devices_cnt, i, tmp) {
 901                         if (tmp == dev) {
 902                                 /* For a VF use its original BDF# not that of the PF
 903                                  * which we used for the IOMMU lookup. Strictly speaking
 904                                  * we could do this for all PCI devices; we only need to
 905                                  * get the BDF# from the scope table for ACPI matches. */
 906                                 if (pdev && pdev->is_virtfn)
 907                                         goto got_pdev;
 908
 909                                 *bus = drhd->devices[i].bus;
 910                                 *devfn = drhd->devices[i].devfn;
 911                                 goto out;
 912                         }
 913
 914                         if (!pdev || !dev_is_pci(tmp))
 915                                 continue;
 916
 917                         ptmp = to_pci_dev(tmp);
 918                         if (ptmp->subordinate &&
 919                             ptmp->subordinate->number <= pdev->bus->number &&
 920                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 921                                 goto got_pdev;
 922                 }
 923
 924                 if (pdev && drhd->include_all) {
 925                 got_pdev:
 926                         *bus = pdev->bus->number;
 927                         *devfn = pdev->devfn;
 928                         goto out;
 929                 }
 930         }
 931         iommu = NULL;
 932  out:
 933         rcu_read_unlock();
 934
 935         return iommu;
 936 }
 937
 938 static void domain_flush_cache(struct dmar_domain *domain,
 939                                void *addr, int size)
 940 {
 941         if (!domain->iommu_coherency)
 942                 clflush_cache_range(addr, size);
 943 }
 944
 945 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 946 {
 947         struct context_entry *context;
 948         int ret = 0;
 949         unsigned long flags;
 950
 951         spin_lock_irqsave(&iommu->lock, flags);
 952         context = iommu_context_addr(iommu, bus, devfn, 0);
 953         if (context)
 954                 ret = context_present(context);
 955         spin_unlock_irqrestore(&iommu->lock, flags);
 956         return ret;
 957 }
 958
 959 static void free_context_table(struct intel_iommu *iommu)
 960 {
 961         int i;
 962         unsigned long flags;
 963         struct context_entry *context;
 964
 965         spin_lock_irqsave(&iommu->lock, flags);
 966         if (!iommu->root_entry) {
 967                 goto out;
 968         }
 969         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 970                 context = iommu_context_addr(iommu, i, 0, 0);
 971                 if (context)
 972                         free_pgtable_page(context);
 973
 974                 if (!ecs_enabled(iommu))
 975                         continue;
 976
 977                 context = iommu_context_addr(iommu, i, 0x80, 0);
 978                 if (context)
 979                         free_pgtable_page(context);
 980
 981         }
 982         free_pgtable_page(iommu->root_entry);
 983         iommu->root_entry = NULL;
 984 out:
 985         spin_unlock_irqrestore(&iommu->lock, flags);
 986 }
 987
 988 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 989                                       unsigned long pfn, int *target_level)
 990 {
 991         struct dma_pte *parent, *pte = NULL;
 992         int level = agaw_to_level(domain->agaw);
 993         int offset;
 994
 995         BUG_ON(!domain->pgd);
 996
 997         if (!domain_pfn_supported(domain, pfn))
 998                 /* Address beyond IOMMU's addressing capabilities. */
 999                 return NULL;
1000
1001         parent = domain->pgd;
1002
1003         while (1) {
1004                 void *tmp_page;
1005
1006                 offset = pfn_level_offset(pfn, level);
1007                 pte = &parent[offset];
1008                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009                         break;
1010                 if (level == *target_level)
1011                         break;
1012
1013                 if (!dma_pte_present(pte)) {
1014                         uint64_t pteval;
1015
1016                         tmp_page = alloc_pgtable_page(domain->nid);
1017
1018                         if (!tmp_page)
1019                                 return NULL;
1020
1021                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1024                                 /* Someone else set it while we were thinking; use theirs. */
1025                                 free_pgtable_page(tmp_page);
1026                         else
1027                                 domain_flush_cache(domain, pte, sizeof(*pte));
1028                 }
1029                 if (level == 1)
1030                         break;
1031
1032                 parent = phys_to_virt(dma_pte_addr(pte));
1033                 level--;
1034         }
1035
1036         if (!*target_level)
1037                 *target_level = level;
1038
1039         return pte;
1040 }
1041
1042
1043 /* return address's pte at specific level */
1044 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045                                          unsigned long pfn,
1046                                          int level, int *large_page)
1047 {
1048         struct dma_pte *parent, *pte = NULL;
1049         int total = agaw_to_level(domain->agaw);
1050         int offset;
1051
1052         parent = domain->pgd;
1053         while (level <= total) {
1054                 offset = pfn_level_offset(pfn, total);
1055                 pte = &parent[offset];
1056                 if (level == total)
1057                         return pte;
1058
1059                 if (!dma_pte_present(pte)) {
1060                         *large_page = total;
1061                         break;
1062                 }
1063
1064                 if (dma_pte_superpage(pte)) {
1065                         *large_page = total;
1066                         return pte;
1067                 }
1068
1069                 parent = phys_to_virt(dma_pte_addr(pte));
1070                 total--;
1071         }
1072         return NULL;
1073 }
1074
1075 /* clear last level pte, a tlb flush should be followed */
1076 static void dma_pte_clear_range(struct dmar_domain *domain,
1077                                 unsigned long start_pfn,
1078                                 unsigned long last_pfn)
1079 {
1080         unsigned int large_page = 1;
1081         struct dma_pte *first_pte, *pte;
1082
1083         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085         BUG_ON(start_pfn > last_pfn);
1086
1087         /* we don't need lock here; nobody else touches the iova range */
1088         do {
1089                 large_page = 1;
1090                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091                 if (!pte) {
1092                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093                         continue;
1094                 }
1095                 do {
1096                         dma_clear_pte(pte);
1097                         start_pfn += lvl_to_nr_pages(large_page);
1098                         pte++;
1099                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100
1101                 domain_flush_cache(domain, first_pte,
1102                                    (void *)pte - (void *)first_pte);
1103
1104         } while (start_pfn && start_pfn <= last_pfn);
1105 }
1106
1107 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108                                int retain_level, struct dma_pte *pte,
1109                                unsigned long pfn, unsigned long start_pfn,
1110                                unsigned long last_pfn)
1111 {
1112         pfn = max(start_pfn, pfn);
1113         pte = &pte[pfn_level_offset(pfn, level)];
1114
1115         do {
1116                 unsigned long level_pfn;
1117                 struct dma_pte *level_pte;
1118
1119                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120                         goto next;
1121
1122                 level_pfn = pfn & level_mask(level);
1123                 level_pte = phys_to_virt(dma_pte_addr(pte));
1124
1125                 if (level > 2) {
1126                         dma_pte_free_level(domain, level - 1, retain_level,
1127                                            level_pte, level_pfn, start_pfn,
1128                                            last_pfn);
1129                 }
1130
1131                 /*
1132                  * Free the page table if we're below the level we want to
1133                  * retain and the range covers the entire table.
1134                  */
1135                 if (level < retain_level && !(start_pfn > level_pfn ||
1136                       last_pfn < level_pfn + level_size(level) - 1)) {
1137                         dma_clear_pte(pte);
1138                         domain_flush_cache(domain, pte, sizeof(*pte));
1139                         free_pgtable_page(level_pte);
1140                 }
1141 next:
1142                 pfn += level_size(level);
1143         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144 }
1145
1146 /*
1147  * clear last level (leaf) ptes and free page table pages below the
1148  * level we wish to keep intact.
1149  */
1150 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151                                    unsigned long start_pfn,
1152                                    unsigned long last_pfn,
1153                                    int retain_level)
1154 {
1155         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157         BUG_ON(start_pfn > last_pfn);
1158
1159         dma_pte_clear_range(domain, start_pfn, last_pfn);
1160
1161         /* We don't need lock here; nobody else touches the iova range */
1162         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163                            domain->pgd, 0, start_pfn, last_pfn);
1164
1165         /* free pgd */
1166         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167                 free_pgtable_page(domain->pgd);
1168                 domain->pgd = NULL;
1169         }
1170 }
1171
1172 /* When a page at a given level is being unlinked from its parent, we don't
1173    need to *modify* it at all. All we need to do is make a list of all the
1174    pages which can be freed just as soon as we've flushed the IOTLB and we
1175    know the hardware page-walk will no longer touch them.
1176    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177    be freed. */
1178 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179                                             int level, struct dma_pte *pte,
1180                                             struct page *freelist)
1181 {
1182         struct page *pg;
1183
1184         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185         pg->freelist = freelist;
1186         freelist = pg;
1187
1188         if (level == 1)
1189                 return freelist;
1190
1191         pte = page_address(pg);
1192         do {
1193                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194                         freelist = dma_pte_list_pagetables(domain, level - 1,
1195                                                            pte, freelist);
1196                 pte++;
1197         } while (!first_pte_in_page(pte));
1198
1199         return freelist;
1200 }
1201
1202 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203                                         struct dma_pte *pte, unsigned long pfn,
1204                                         unsigned long start_pfn,
1205                                         unsigned long last_pfn,
1206                                         struct page *freelist)
1207 {
1208         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209
1210         pfn = max(start_pfn, pfn);
1211         pte = &pte[pfn_level_offset(pfn, level)];
1212
1213         do {
1214                 unsigned long level_pfn;
1215
1216                 if (!dma_pte_present(pte))
1217                         goto next;
1218
1219                 level_pfn = pfn & level_mask(level);
1220
1221                 /* If range covers entire pagetable, free it */
1222                 if (start_pfn <= level_pfn &&
1223                     last_pfn >= level_pfn + level_size(level) - 1) {
1224                         /* These suborbinate page tables are going away entirely. Don't
1225                            bother to clear them; we're just going to *free* them. */
1226                         if (level > 1 && !dma_pte_superpage(pte))
1227                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228
1229                         dma_clear_pte(pte);
1230                         if (!first_pte)
1231                                 first_pte = pte;
1232                         last_pte = pte;
1233                 } else if (level > 1) {
1234                         /* Recurse down into a level that isn't *entirely* obsolete */
1235                         freelist = dma_pte_clear_level(domain, level - 1,
1236                                                        phys_to_virt(dma_pte_addr(pte)),
1237                                                        level_pfn, start_pfn, last_pfn,
1238                                                        freelist);
1239                 }
1240 next:
1241                 pfn += level_size(level);
1242         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243
1244         if (first_pte)
1245                 domain_flush_cache(domain, first_pte,
1246                                    (void *)++last_pte - (void *)first_pte);
1247
1248         return freelist;
1249 }
1250
1251 /* We can't just free the pages because the IOMMU may still be walking
1252    the page tables, and may have cached the intermediate levels. The
1253    pages can only be freed after the IOTLB flush has been done. */
1254 static struct page *domain_unmap(struct dmar_domain *domain,
1255                                  unsigned long start_pfn,
1256                                  unsigned long last_pfn)
1257 {
1258         struct page *freelist = NULL;
1259
1260         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262         BUG_ON(start_pfn > last_pfn);
1263
1264         /* we don't need lock here; nobody else touches the iova range */
1265         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1267
1268         /* free pgd */
1269         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270                 struct page *pgd_page = virt_to_page(domain->pgd);
1271                 pgd_page->freelist = freelist;
1272                 freelist = pgd_page;
1273
1274                 domain->pgd = NULL;
1275         }
1276
1277         return freelist;
1278 }
1279
1280 static void dma_free_pagelist(struct page *freelist)
1281 {
1282         struct page *pg;
1283
1284         while ((pg = freelist)) {
1285                 freelist = pg->freelist;
1286                 free_pgtable_page(page_address(pg));
1287         }
1288 }
1289
1290 static void iova_entry_free(unsigned long data)
1291 {
1292         struct page *freelist = (struct page *)data;
1293
1294         dma_free_pagelist(freelist);
1295 }
1296
1297 /* iommu handling */
1298 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299 {
1300         struct root_entry *root;
1301         unsigned long flags;
1302
1303         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304         if (!root) {
1305                 pr_err("Allocating root entry for %s failed\n",
1306                         iommu->name);
1307                 return -ENOMEM;
1308         }
1309
1310         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1311
1312         spin_lock_irqsave(&iommu->lock, flags);
1313         iommu->root_entry = root;
1314         spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316         return 0;
1317 }
1318
1319 static void iommu_set_root_entry(struct intel_iommu *iommu)
1320 {
1321         u64 addr;
1322         u32 sts;
1323         unsigned long flag;
1324
1325         addr = virt_to_phys(iommu->root_entry);
1326         if (ecs_enabled(iommu))
1327                 addr |= DMA_RTADDR_RTT;
1328
1329         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331
1332         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333
1334         /* Make sure hardware complete it */
1335         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336                       readl, (sts & DMA_GSTS_RTPS), sts);
1337
1338         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 }
1340
1341 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342 {
1343         u32 val;
1344         unsigned long flag;
1345
1346         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347                 return;
1348
1349         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351
1352         /* Make sure hardware complete it */
1353         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354                       readl, (!(val & DMA_GSTS_WBFS)), val);
1355
1356         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357 }
1358
1359 /* return value determine if we need a write buffer flush */
1360 static void __iommu_flush_context(struct intel_iommu *iommu,
1361                                   u16 did, u16 source_id, u8 function_mask,
1362                                   u64 type)
1363 {
1364         u64 val = 0;
1365         unsigned long flag;
1366
1367         switch (type) {
1368         case DMA_CCMD_GLOBAL_INVL:
1369                 val = DMA_CCMD_GLOBAL_INVL;
1370                 break;
1371         case DMA_CCMD_DOMAIN_INVL:
1372                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373                 break;
1374         case DMA_CCMD_DEVICE_INVL:
1375                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377                 break;
1378         default:
1379                 BUG();
1380         }
1381         val |= DMA_CCMD_ICC;
1382
1383         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385
1386         /* Make sure hardware complete it */
1387         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389
1390         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391 }
1392
1393 /* return value determine if we need a write buffer flush */
1394 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395                                 u64 addr, unsigned int size_order, u64 type)
1396 {
1397         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398         u64 val = 0, val_iva = 0;
1399         unsigned long flag;
1400
1401         switch (type) {
1402         case DMA_TLB_GLOBAL_FLUSH:
1403                 /* global flush doesn't need set IVA_REG */
1404                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405                 break;
1406         case DMA_TLB_DSI_FLUSH:
1407                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408                 break;
1409         case DMA_TLB_PSI_FLUSH:
1410                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411                 /* IH bit is passed in as part of address */
1412                 val_iva = size_order | addr;
1413                 break;
1414         default:
1415                 BUG();
1416         }
1417         /* Note: set drain read/write */
1418 #if 0
1419         /*
1420          * This is probably to be super secure.. Looks like we can
1421          * ignore it without any impact.
1422          */
1423         if (cap_read_drain(iommu->cap))
1424                 val |= DMA_TLB_READ_DRAIN;
1425 #endif
1426         if (cap_write_drain(iommu->cap))
1427                 val |= DMA_TLB_WRITE_DRAIN;
1428
1429         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430         /* Note: Only uses first TLB reg currently */
1431         if (val_iva)
1432                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434
1435         /* Make sure hardware complete it */
1436         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438
1439         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440
1441         /* check IOTLB invalidation granularity */
1442         if (DMA_TLB_IAIG(val) == 0)
1443                 pr_err("Flush IOTLB failed\n");
1444         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1446                         (unsigned long long)DMA_TLB_IIRG(type),
1447                         (unsigned long long)DMA_TLB_IAIG(val));
1448 }
1449
1450 static struct device_domain_info *
1451 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452                          u8 bus, u8 devfn)
1453 {
1454         struct device_domain_info *info;
1455
1456         assert_spin_locked(&device_domain_lock);
1457
1458         if (!iommu->qi)
1459                 return NULL;
1460
1461         list_for_each_entry(info, &domain->devices, link)
1462                 if (info->iommu == iommu && info->bus == bus &&
1463                     info->devfn == devfn) {
1464                         if (info->ats_supported && info->dev)
1465                                 return info;
1466                         break;
1467                 }
1468
1469         return NULL;
1470 }
1471
1472 static void domain_update_iotlb(struct dmar_domain *domain)
1473 {
1474         struct device_domain_info *info;
1475         bool has_iotlb_device = false;
1476
1477         assert_spin_locked(&device_domain_lock);
1478
1479         list_for_each_entry(info, &domain->devices, link) {
1480                 struct pci_dev *pdev;
1481
1482                 if (!info->dev || !dev_is_pci(info->dev))
1483                         continue;
1484
1485                 pdev = to_pci_dev(info->dev);
1486                 if (pdev->ats_enabled) {
1487                         has_iotlb_device = true;
1488                         break;
1489                 }
1490         }
1491
1492         domain->has_iotlb_device = has_iotlb_device;
1493 }
1494
1495 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496 {
1497         struct pci_dev *pdev;
1498
1499         assert_spin_locked(&device_domain_lock);
1500
1501         if (!info || !dev_is_pci(info->dev))
1502                 return;
1503
1504         pdev = to_pci_dev(info->dev);
1505         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1506          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1507          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1508          * reserved, which should be set to 0.
1509          */
1510         if (!ecap_dit(info->iommu->ecap))
1511                 info->pfsid = 0;
1512         else {
1513                 struct pci_dev *pf_pdev;
1514
1515                 /* pdev will be returned if device is not a vf */
1516                 pf_pdev = pci_physfn(pdev);
1517                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1518         }
1519
1520 #ifdef CONFIG_INTEL_IOMMU_SVM
1521         /* The PCIe spec, in its wisdom, declares that the behaviour of
1522            the device if you enable PASID support after ATS support is
1523            undefined. So always enable PASID support on devices which
1524            have it, even if we can't yet know if we're ever going to
1525            use it. */
1526         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1527                 info->pasid_enabled = 1;
1528
1529         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1530                 info->pri_enabled = 1;
1531 #endif
1532         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1533                 info->ats_enabled = 1;
1534                 domain_update_iotlb(info->domain);
1535                 info->ats_qdep = pci_ats_queue_depth(pdev);
1536         }
1537 }
1538
1539 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1540 {
1541         struct pci_dev *pdev;
1542
1543         assert_spin_locked(&device_domain_lock);
1544
1545         if (!dev_is_pci(info->dev))
1546                 return;
1547
1548         pdev = to_pci_dev(info->dev);
1549
1550         if (info->ats_enabled) {
1551                 pci_disable_ats(pdev);
1552                 info->ats_enabled = 0;
1553                 domain_update_iotlb(info->domain);
1554         }
1555 #ifdef CONFIG_INTEL_IOMMU_SVM
1556         if (info->pri_enabled) {
1557                 pci_disable_pri(pdev);
1558                 info->pri_enabled = 0;
1559         }
1560         if (info->pasid_enabled) {
1561                 pci_disable_pasid(pdev);
1562                 info->pasid_enabled = 0;
1563         }
1564 #endif
1565 }
1566
1567 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1568                                   u64 addr, unsigned mask)
1569 {
1570         u16 sid, qdep;
1571         unsigned long flags;
1572         struct device_domain_info *info;
1573
1574         if (!domain->has_iotlb_device)
1575                 return;
1576
1577         spin_lock_irqsave(&device_domain_lock, flags);
1578         list_for_each_entry(info, &domain->devices, link) {
1579                 if (!info->ats_enabled)
1580                         continue;
1581
1582                 sid = info->bus << 8 | info->devfn;
1583                 qdep = info->ats_qdep;
1584                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1585                                 qdep, addr, mask);
1586         }
1587         spin_unlock_irqrestore(&device_domain_lock, flags);
1588 }
1589
1590 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1591                                   struct dmar_domain *domain,
1592                                   unsigned long pfn, unsigned int pages,
1593                                   int ih, int map)
1594 {
1595         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1596         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1597         u16 did = domain->iommu_did[iommu->seq_id];
1598
1599         BUG_ON(pages == 0);
1600
1601         if (ih)
1602                 ih = 1 << 6;
1603         /*
1604          * Fallback to domain selective flush if no PSI support or the size is
1605          * too big.
1606          * PSI requires page size to be 2 ^ x, and the base address is naturally
1607          * aligned to the size
1608          */
1609         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1610                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1611                                                 DMA_TLB_DSI_FLUSH);
1612         else
1613                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1614                                                 DMA_TLB_PSI_FLUSH);
1615
1616         /*
1617          * In caching mode, changes of pages from non-present to present require
1618          * flush. However, device IOTLB doesn't need to be flushed in this case.
1619          */
1620         if (!cap_caching_mode(iommu->cap) || !map)
1621                 iommu_flush_dev_iotlb(domain, addr, mask);
1622 }
1623
1624 static void iommu_flush_iova(struct iova_domain *iovad)
1625 {
1626         struct dmar_domain *domain;
1627         int idx;
1628
1629         domain = container_of(iovad, struct dmar_domain, iovad);
1630
1631         for_each_domain_iommu(idx, domain) {
1632                 struct intel_iommu *iommu = g_iommus[idx];
1633                 u16 did = domain->iommu_did[iommu->seq_id];
1634
1635                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1636
1637                 if (!cap_caching_mode(iommu->cap))
1638                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1639                                               0, MAX_AGAW_PFN_WIDTH);
1640         }
1641 }
1642
1643 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1644 {
1645         u32 pmen;
1646         unsigned long flags;
1647
1648         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1649                 return;
1650
1651         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1652         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1653         pmen &= ~DMA_PMEN_EPM;
1654         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1655
1656         /* wait for the protected region status bit to clear */
1657         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1658                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1659
1660         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1661 }
1662
1663 static void iommu_enable_translation(struct intel_iommu *iommu)
1664 {
1665         u32 sts;
1666         unsigned long flags;
1667
1668         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669         iommu->gcmd |= DMA_GCMD_TE;
1670         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1671
1672         /* Make sure hardware complete it */
1673         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1674                       readl, (sts & DMA_GSTS_TES), sts);
1675
1676         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1677 }
1678
1679 static void iommu_disable_translation(struct intel_iommu *iommu)
1680 {
1681         u32 sts;
1682         unsigned long flag;
1683
1684         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685         iommu->gcmd &= ~DMA_GCMD_TE;
1686         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
1688         /* Make sure hardware complete it */
1689         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690                       readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693 }
1694
1695
1696 static int iommu_init_domains(struct intel_iommu *iommu)
1697 {
1698         u32 ndomains, nlongs;
1699         size_t size;
1700
1701         ndomains = cap_ndoms(iommu->cap);
1702         pr_debug("%s: Number of Domains supported <%d>\n",
1703                  iommu->name, ndomains);
1704         nlongs = BITS_TO_LONGS(ndomains);
1705
1706         spin_lock_init(&iommu->lock);
1707
1708         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1709         if (!iommu->domain_ids) {
1710                 pr_err("%s: Allocating domain id array failed\n",
1711                        iommu->name);
1712                 return -ENOMEM;
1713         }
1714
1715         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1716         iommu->domains = kzalloc(size, GFP_KERNEL);
1717
1718         if (iommu->domains) {
1719                 size = 256 * sizeof(struct dmar_domain *);
1720                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1721         }
1722
1723         if (!iommu->domains || !iommu->domains[0]) {
1724                 pr_err("%s: Allocating domain array failed\n",
1725                        iommu->name);
1726                 kfree(iommu->domain_ids);
1727                 kfree(iommu->domains);
1728                 iommu->domain_ids = NULL;
1729                 iommu->domains    = NULL;
1730                 return -ENOMEM;
1731         }
1732
1733
1734
1735         /*
1736          * If Caching mode is set, then invalid translations are tagged
1737          * with domain-id 0, hence we need to pre-allocate it. We also
1738          * use domain-id 0 as a marker for non-allocated domain-id, so
1739          * make sure it is not used for a real domain.
1740          */
1741         set_bit(0, iommu->domain_ids);
1742
1743         return 0;
1744 }
1745
1746 static void disable_dmar_iommu(struct intel_iommu *iommu)
1747 {
1748         struct device_domain_info *info, *tmp;
1749         unsigned long flags;
1750
1751         if (!iommu->domains || !iommu->domain_ids)
1752                 return;
1753
1754 again:
1755         spin_lock_irqsave(&device_domain_lock, flags);
1756         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1757                 struct dmar_domain *domain;
1758
1759                 if (info->iommu != iommu)
1760                         continue;
1761
1762                 if (!info->dev || !info->domain)
1763                         continue;
1764
1765                 domain = info->domain;
1766
1767                 __dmar_remove_one_dev_info(info);
1768
1769                 if (!domain_type_is_vm_or_si(domain)) {
1770                         /*
1771                          * The domain_exit() function  can't be called under
1772                          * device_domain_lock, as it takes this lock itself.
1773                          * So release the lock here and re-run the loop
1774                          * afterwards.
1775                          */
1776                         spin_unlock_irqrestore(&device_domain_lock, flags);
1777                         domain_exit(domain);
1778                         goto again;
1779                 }
1780         }
1781         spin_unlock_irqrestore(&device_domain_lock, flags);
1782
1783         if (iommu->gcmd & DMA_GCMD_TE)
1784                 iommu_disable_translation(iommu);
1785 }
1786
1787 static void free_dmar_iommu(struct intel_iommu *iommu)
1788 {
1789         if ((iommu->domains) && (iommu->domain_ids)) {
1790                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1791                 int i;
1792
1793                 for (i = 0; i < elems; i++)
1794                         kfree(iommu->domains[i]);
1795                 kfree(iommu->domains);
1796                 kfree(iommu->domain_ids);
1797                 iommu->domains = NULL;
1798                 iommu->domain_ids = NULL;
1799         }
1800
1801         g_iommus[iommu->seq_id] = NULL;
1802
1803         /* free context mapping */
1804         free_context_table(iommu);
1805
1806 #ifdef CONFIG_INTEL_IOMMU_SVM
1807         if (pasid_enabled(iommu)) {
1808                 if (ecap_prs(iommu->ecap))
1809                         intel_svm_finish_prq(iommu);
1810                 intel_svm_free_pasid_tables(iommu);
1811         }
1812 #endif
1813 }
1814
1815 static struct dmar_domain *alloc_domain(int flags)
1816 {
1817         struct dmar_domain *domain;
1818
1819         domain = alloc_domain_mem();
1820         if (!domain)
1821                 return NULL;
1822
1823         memset(domain, 0, sizeof(*domain));
1824         domain->nid = -1;
1825         domain->flags = flags;
1826         domain->has_iotlb_device = false;
1827         INIT_LIST_HEAD(&domain->devices);
1828
1829         return domain;
1830 }
1831
1832 /* Must be called with iommu->lock */
1833 static int domain_attach_iommu(struct dmar_domain *domain,
1834                                struct intel_iommu *iommu)
1835 {
1836         unsigned long ndomains;
1837         int num;
1838
1839         assert_spin_locked(&device_domain_lock);
1840         assert_spin_locked(&iommu->lock);
1841
1842         domain->iommu_refcnt[iommu->seq_id] += 1;
1843         domain->iommu_count += 1;
1844         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845                 ndomains = cap_ndoms(iommu->cap);
1846                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1847
1848                 if (num >= ndomains) {
1849                         pr_err("%s: No free domain ids\n", iommu->name);
1850                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1851                         domain->iommu_count -= 1;
1852                         return -ENOSPC;
1853                 }
1854
1855                 set_bit(num, iommu->domain_ids);
1856                 set_iommu_domain(iommu, num, domain);
1857
1858                 domain->iommu_did[iommu->seq_id] = num;
1859                 domain->nid                      = iommu->node;
1860
1861                 domain_update_iommu_cap(domain);
1862         }
1863
1864         return 0;
1865 }
1866
1867 static int domain_detach_iommu(struct dmar_domain *domain,
1868                                struct intel_iommu *iommu)
1869 {
1870         int num, count = INT_MAX;
1871
1872         assert_spin_locked(&device_domain_lock);
1873         assert_spin_locked(&iommu->lock);
1874
1875         domain->iommu_refcnt[iommu->seq_id] -= 1;
1876         count = --domain->iommu_count;
1877         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1878                 num = domain->iommu_did[iommu->seq_id];
1879                 clear_bit(num, iommu->domain_ids);
1880                 set_iommu_domain(iommu, num, NULL);
1881
1882                 domain_update_iommu_cap(domain);
1883                 domain->iommu_did[iommu->seq_id] = 0;
1884         }
1885
1886         return count;
1887 }
1888
1889 static struct iova_domain reserved_iova_list;
1890 static struct lock_class_key reserved_rbtree_key;
1891
1892 static int dmar_init_reserved_ranges(void)
1893 {
1894         struct pci_dev *pdev = NULL;
1895         struct iova *iova;
1896         int i;
1897
1898         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1899                         DMA_32BIT_PFN);
1900
1901         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1902                 &reserved_rbtree_key);
1903
1904         /* IOAPIC ranges shouldn't be accessed by DMA */
1905         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1906                 IOVA_PFN(IOAPIC_RANGE_END));
1907         if (!iova) {
1908                 pr_err("Reserve IOAPIC range failed\n");
1909                 return -ENODEV;
1910         }
1911
1912         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1913         for_each_pci_dev(pdev) {
1914                 struct resource *r;
1915
1916                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1917                         r = &pdev->resource[i];
1918                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1919                                 continue;
1920                         iova = reserve_iova(&reserved_iova_list,
1921                                             IOVA_PFN(r->start),
1922                                             IOVA_PFN(r->end));
1923                         if (!iova) {
1924                                 pr_err("Reserve iova failed\n");
1925                                 return -ENODEV;
1926                         }
1927                 }
1928         }
1929         return 0;
1930 }
1931
1932 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1933 {
1934         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1935 }
1936
1937 static inline int guestwidth_to_adjustwidth(int gaw)
1938 {
1939         int agaw;
1940         int r = (gaw - 12) % 9;
1941
1942         if (r == 0)
1943                 agaw = gaw;
1944         else
1945                 agaw = gaw + 9 - r;
1946         if (agaw > 64)
1947                 agaw = 64;
1948         return agaw;
1949 }
1950
1951 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1952                        int guest_width)
1953 {
1954         int adjust_width, agaw;
1955         unsigned long sagaw;
1956         int err;
1957
1958         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1959                         DMA_32BIT_PFN);
1960
1961         err = init_iova_flush_queue(&domain->iovad,
1962                                     iommu_flush_iova, iova_entry_free);
1963         if (err)
1964                 return err;
1965
1966         domain_reserve_special_ranges(domain);
1967
1968         /* calculate AGAW */
1969         if (guest_width > cap_mgaw(iommu->cap))
1970                 guest_width = cap_mgaw(iommu->cap);
1971         domain->gaw = guest_width;
1972         adjust_width = guestwidth_to_adjustwidth(guest_width);
1973         agaw = width_to_agaw(adjust_width);
1974         sagaw = cap_sagaw(iommu->cap);
1975         if (!test_bit(agaw, &sagaw)) {
1976                 /* hardware doesn't support it, choose a bigger one */
1977                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1978                 agaw = find_next_bit(&sagaw, 5, agaw);
1979                 if (agaw >= 5)
1980                         return -ENODEV;
1981         }
1982         domain->agaw = agaw;
1983
1984         if (ecap_coherent(iommu->ecap))
1985                 domain->iommu_coherency = 1;
1986         else
1987                 domain->iommu_coherency = 0;
1988
1989         if (ecap_sc_support(iommu->ecap))
1990                 domain->iommu_snooping = 1;
1991         else
1992                 domain->iommu_snooping = 0;
1993
1994         if (intel_iommu_superpage)
1995                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1996         else
1997                 domain->iommu_superpage = 0;
1998
1999         domain->nid = iommu->node;
2000
2001         /* always allocate the top pgd */
2002         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2003         if (!domain->pgd)
2004                 return -ENOMEM;
2005         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
2006         return 0;
2007 }
2008
2009 static void domain_exit(struct dmar_domain *domain)
2010 {
2011         struct page *freelist = NULL;
2012
2013         /* Domain 0 is reserved, so dont process it */
2014         if (!domain)
2015                 return;
2016
2017         /* Remove associated devices and clear attached or cached domains */
2018         rcu_read_lock();
2019         domain_remove_dev_info(domain);
2020         rcu_read_unlock();
2021
2022         /* destroy iovas */
2023         put_iova_domain(&domain->iovad);
2024
2025         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2026
2027         dma_free_pagelist(freelist);
2028
2029         free_domain_mem(domain);
2030 }
2031
2032 static int domain_context_mapping_one(struct dmar_domain *domain,
2033                                       struct intel_iommu *iommu,
2034                                       u8 bus, u8 devfn)
2035 {
2036         u16 did = domain->iommu_did[iommu->seq_id];
2037         int translation = CONTEXT_TT_MULTI_LEVEL;
2038         struct device_domain_info *info = NULL;
2039         struct context_entry *context;
2040         unsigned long flags;
2041         struct dma_pte *pgd;
2042         int ret, agaw;
2043
2044         WARN_ON(did == 0);
2045
2046         if (hw_pass_through && domain_type_is_si(domain))
2047                 translation = CONTEXT_TT_PASS_THROUGH;
2048
2049         pr_debug("Set context mapping for %02x:%02x.%d\n",
2050                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2051
2052         BUG_ON(!domain->pgd);
2053
2054         spin_lock_irqsave(&device_domain_lock, flags);
2055         spin_lock(&iommu->lock);
2056
2057         ret = -ENOMEM;
2058         context = iommu_context_addr(iommu, bus, devfn, 1);
2059         if (!context)
2060                 goto out_unlock;
2061
2062         ret = 0;
2063         if (context_present(context))
2064                 goto out_unlock;
2065
2066         /*
2067          * For kdump cases, old valid entries may be cached due to the
2068          * in-flight DMA and copied pgtable, but there is no unmapping
2069          * behaviour for them, thus we need an explicit cache flush for
2070          * the newly-mapped device. For kdump, at this point, the device
2071          * is supposed to finish reset at its driver probe stage, so no
2072          * in-flight DMA will exist, and we don't need to worry anymore
2073          * hereafter.
2074          */
2075         if (context_copied(context)) {
2076                 u16 did_old = context_domain_id(context);
2077
2078                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap)) {
2079                         iommu->flush.flush_context(iommu, did_old,
2080                                                    (((u16)bus) << 8) | devfn,
2081                                                    DMA_CCMD_MASK_NOBIT,
2082                                                    DMA_CCMD_DEVICE_INVL);
2083                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2084                                                  DMA_TLB_DSI_FLUSH);
2085                 }
2086         }
2087
2088         pgd = domain->pgd;
2089
2090         context_clear_entry(context);
2091         context_set_domain_id(context, did);
2092
2093         /*
2094          * Skip top levels of page tables for iommu which has less agaw
2095          * than default.  Unnecessary for PT mode.
2096          */
2097         if (translation != CONTEXT_TT_PASS_THROUGH) {
2098                 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2099                         ret = -ENOMEM;
2100                         pgd = phys_to_virt(dma_pte_addr(pgd));
2101                         if (!dma_pte_present(pgd))
2102                                 goto out_unlock;
2103                 }
2104
2105                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2106                 if (info && info->ats_supported)
2107                         translation = CONTEXT_TT_DEV_IOTLB;
2108                 else
2109                         translation = CONTEXT_TT_MULTI_LEVEL;
2110
2111                 context_set_address_root(context, virt_to_phys(pgd));
2112                 context_set_address_width(context, agaw);
2113         } else {
2114                 /*
2115                  * In pass through mode, AW must be programmed to
2116                  * indicate the largest AGAW value supported by
2117                  * hardware. And ASR is ignored by hardware.
2118                  */
2119                 context_set_address_width(context, iommu->msagaw);
2120         }
2121
2122         context_set_translation_type(context, translation);
2123         context_set_fault_enable(context);
2124         context_set_present(context);
2125         domain_flush_cache(domain, context, sizeof(*context));
2126
2127         /*
2128          * It's a non-present to present mapping. If hardware doesn't cache
2129          * non-present entry we only need to flush the write-buffer. If the
2130          * _does_ cache non-present entries, then it does so in the special
2131          * domain #0, which we have to flush:
2132          */
2133         if (cap_caching_mode(iommu->cap)) {
2134                 iommu->flush.flush_context(iommu, 0,
2135                                            (((u16)bus) << 8) | devfn,
2136                                            DMA_CCMD_MASK_NOBIT,
2137                                            DMA_CCMD_DEVICE_INVL);
2138                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2139         } else {
2140                 iommu_flush_write_buffer(iommu);
2141         }
2142         iommu_enable_dev_iotlb(info);
2143
2144         ret = 0;
2145
2146 out_unlock:
2147         spin_unlock(&iommu->lock);
2148         spin_unlock_irqrestore(&device_domain_lock, flags);
2149
2150         return ret;
2151 }
2152
2153 struct domain_context_mapping_data {
2154         struct dmar_domain *domain;
2155         struct intel_iommu *iommu;
2156 };
2157
2158 static int domain_context_mapping_cb(struct pci_dev *pdev,
2159                                      u16 alias, void *opaque)
2160 {
2161         struct domain_context_mapping_data *data = opaque;
2162
2163         return domain_context_mapping_one(data->domain, data->iommu,
2164                                           PCI_BUS_NUM(alias), alias & 0xff);
2165 }
2166
2167 static int
2168 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2169 {
2170         struct intel_iommu *iommu;
2171         u8 bus, devfn;
2172         struct domain_context_mapping_data data;
2173
2174         iommu = device_to_iommu(dev, &bus, &devfn);
2175         if (!iommu)
2176                 return -ENODEV;
2177
2178         if (!dev_is_pci(dev))
2179                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2180
2181         data.domain = domain;
2182         data.iommu = iommu;
2183
2184         return pci_for_each_dma_alias(to_pci_dev(dev),
2185                                       &domain_context_mapping_cb, &data);
2186 }
2187
2188 static int domain_context_mapped_cb(struct pci_dev *pdev,
2189                                     u16 alias, void *opaque)
2190 {
2191         struct intel_iommu *iommu = opaque;
2192
2193         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2194 }
2195
2196 static int domain_context_mapped(struct device *dev)
2197 {
2198         struct intel_iommu *iommu;
2199         u8 bus, devfn;
2200
2201         iommu = device_to_iommu(dev, &bus, &devfn);
2202         if (!iommu)
2203                 return -ENODEV;
2204
2205         if (!dev_is_pci(dev))
2206                 return device_context_mapped(iommu, bus, devfn);
2207
2208         return !pci_for_each_dma_alias(to_pci_dev(dev),
2209                                        domain_context_mapped_cb, iommu);
2210 }
2211
2212 /* Returns a number of VTD pages, but aligned to MM page size */
2213 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2214                                             size_t size)
2215 {
2216         host_addr &= ~PAGE_MASK;
2217         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2218 }
2219
2220 /* Return largest possible superpage level for a given mapping */
2221 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2222                                           unsigned long iov_pfn,
2223                                           unsigned long phy_pfn,
2224                                           unsigned long pages)
2225 {
2226         int support, level = 1;
2227         unsigned long pfnmerge;
2228
2229         support = domain->iommu_superpage;
2230
2231         /* To use a large page, the virtual *and* physical addresses
2232            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2233            of them will mean we have to use smaller pages. So just
2234            merge them and check both at once. */
2235         pfnmerge = iov_pfn | phy_pfn;
2236
2237         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2238                 pages >>= VTD_STRIDE_SHIFT;
2239                 if (!pages)
2240                         break;
2241                 pfnmerge >>= VTD_STRIDE_SHIFT;
2242                 level++;
2243                 support--;
2244         }
2245         return level;
2246 }
2247
2248 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2249                             struct scatterlist *sg, unsigned long phys_pfn,
2250                             unsigned long nr_pages, int prot)
2251 {
2252         struct dma_pte *first_pte = NULL, *pte = NULL;
2253         phys_addr_t uninitialized_var(pteval);
2254         unsigned long sg_res = 0;
2255         unsigned int largepage_lvl = 0;
2256         unsigned long lvl_pages = 0;
2257
2258         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2259
2260         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2261                 return -EINVAL;
2262
2263         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2264
2265         if (!sg) {
2266                 sg_res = nr_pages;
2267                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2268         }
2269
2270         while (nr_pages > 0) {
2271                 uint64_t tmp;
2272
2273                 if (!sg_res) {
2274                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2275
2276                         sg_res = aligned_nrpages(sg->offset, sg->length);
2277                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2278                         sg->dma_length = sg->length;
2279                         pteval = (sg_phys(sg) - pgoff) | prot;
2280                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2281                 }
2282
2283                 if (!pte) {
2284                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2285
2286                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2287                         if (!pte)
2288                                 return -ENOMEM;
2289                         /* It is large page*/
2290                         if (largepage_lvl > 1) {
2291                                 unsigned long nr_superpages, end_pfn;
2292
2293                                 pteval |= DMA_PTE_LARGE_PAGE;
2294                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2295
2296                                 nr_superpages = sg_res / lvl_pages;
2297                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2298
2299                                 /*
2300                                  * Ensure that old small page tables are
2301                                  * removed to make room for superpage(s).
2302                                  * We're adding new large pages, so make sure
2303                                  * we don't remove their parent tables.
2304                                  */
2305                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2306                                                        largepage_lvl + 1);
2307                         } else {
2308                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2309                         }
2310
2311                 }
2312                 /* We don't need lock here, nobody else
2313                  * touches the iova range
2314                  */
2315                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2316                 if (tmp) {
2317                         static int dumps = 5;
2318                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2319                                 iov_pfn, tmp, (unsigned long long)pteval);
2320                         if (dumps) {
2321                                 dumps--;
2322                                 debug_dma_dump_mappings(NULL);
2323                         }
2324                         WARN_ON(1);
2325                 }
2326
2327                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2328
2329                 BUG_ON(nr_pages < lvl_pages);
2330                 BUG_ON(sg_res < lvl_pages);
2331
2332                 nr_pages -= lvl_pages;
2333                 iov_pfn += lvl_pages;
2334                 phys_pfn += lvl_pages;
2335                 pteval += lvl_pages * VTD_PAGE_SIZE;
2336                 sg_res -= lvl_pages;
2337
2338                 /* If the next PTE would be the first in a new page, then we
2339                    need to flush the cache on the entries we've just written.
2340                    And then we'll need to recalculate 'pte', so clear it and
2341                    let it get set again in the if (!pte) block above.
2342
2343                    If we're done (!nr_pages) we need to flush the cache too.
2344
2345                    Also if we've been setting superpages, we may need to
2346                    recalculate 'pte' and switch back to smaller pages for the
2347                    end of the mapping, if the trailing size is not enough to
2348                    use another superpage (i.e. sg_res < lvl_pages). */
2349                 pte++;
2350                 if (!nr_pages || first_pte_in_page(pte) ||
2351                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2352                         domain_flush_cache(domain, first_pte,
2353                                            (void *)pte - (void *)first_pte);
2354                         pte = NULL;
2355                 }
2356
2357                 if (!sg_res && nr_pages)
2358                         sg = sg_next(sg);
2359         }
2360         return 0;
2361 }
2362
2363 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364                                     struct scatterlist *sg, unsigned long nr_pages,
2365                                     int prot)
2366 {
2367         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2368 }
2369
2370 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2371                                      unsigned long phys_pfn, unsigned long nr_pages,
2372                                      int prot)
2373 {
2374         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2375 }
2376
2377 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2378 {
2379         unsigned long flags;
2380         struct context_entry *context;
2381         u16 did_old;
2382
2383         if (!iommu)
2384                 return;
2385
2386         spin_lock_irqsave(&iommu->lock, flags);
2387         context = iommu_context_addr(iommu, bus, devfn, 0);
2388         if (!context) {
2389                 spin_unlock_irqrestore(&iommu->lock, flags);
2390                 return;
2391         }
2392         did_old = context_domain_id(context);
2393         context_clear_entry(context);
2394         __iommu_flush_cache(iommu, context, sizeof(*context));
2395         spin_unlock_irqrestore(&iommu->lock, flags);
2396         iommu->flush.flush_context(iommu,
2397                                    did_old,
2398                                    (((u16)bus) << 8) | devfn,
2399                                    DMA_CCMD_MASK_NOBIT,
2400                                    DMA_CCMD_DEVICE_INVL);
2401         iommu->flush.flush_iotlb(iommu,
2402                                  did_old,
2403                                  0,
2404                                  0,
2405                                  DMA_TLB_DSI_FLUSH);
2406 }
2407
2408 static inline void unlink_domain_info(struct device_domain_info *info)
2409 {
2410         assert_spin_locked(&device_domain_lock);
2411         list_del(&info->link);
2412         list_del(&info->global);
2413         if (info->dev)
2414                 info->dev->archdata.iommu = NULL;
2415 }
2416
2417 static void domain_remove_dev_info(struct dmar_domain *domain)
2418 {
2419         struct device_domain_info *info, *tmp;
2420         unsigned long flags;
2421
2422         spin_lock_irqsave(&device_domain_lock, flags);
2423         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2424                 __dmar_remove_one_dev_info(info);
2425         spin_unlock_irqrestore(&device_domain_lock, flags);
2426 }
2427
2428 /*
2429  * find_domain
2430  * Note: we use struct device->archdata.iommu stores the info
2431  */
2432 static struct dmar_domain *find_domain(struct device *dev)
2433 {
2434         struct device_domain_info *info;
2435
2436         /* No lock here, assumes no domain exit in normal case */
2437         info = dev->archdata.iommu;
2438         if (likely(info))
2439                 return info->domain;
2440         return NULL;
2441 }
2442
2443 static inline struct device_domain_info *
2444 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2445 {
2446         struct device_domain_info *info;
2447
2448         list_for_each_entry(info, &device_domain_list, global)
2449                 if (info->iommu->segment == segment && info->bus == bus &&
2450                     info->devfn == devfn)
2451                         return info;
2452
2453         return NULL;
2454 }
2455
2456 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2457                                                     int bus, int devfn,
2458                                                     struct device *dev,
2459                                                     struct dmar_domain *domain)
2460 {
2461         struct dmar_domain *found = NULL;
2462         struct device_domain_info *info;
2463         unsigned long flags;
2464         int ret;
2465
2466         info = alloc_devinfo_mem();
2467         if (!info)
2468                 return NULL;
2469
2470         info->bus = bus;
2471         info->devfn = devfn;
2472         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2473         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2474         info->ats_qdep = 0;
2475         info->dev = dev;
2476         info->domain = domain;
2477         info->iommu = iommu;
2478
2479         if (dev && dev_is_pci(dev)) {
2480                 struct pci_dev *pdev = to_pci_dev(info->dev);
2481
2482                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2483                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2484                     dmar_find_matched_atsr_unit(pdev))
2485                         info->ats_supported = 1;
2486
2487                 if (ecs_enabled(iommu)) {
2488                         if (pasid_enabled(iommu)) {
2489                                 int features = pci_pasid_features(pdev);
2490                                 if (features >= 0)
2491                                         info->pasid_supported = features | 1;
2492                         }
2493
2494                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2495                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2496                                 info->pri_supported = 1;
2497                 }
2498         }
2499
2500         spin_lock_irqsave(&device_domain_lock, flags);
2501         if (dev)
2502                 found = find_domain(dev);
2503
2504         if (!found) {
2505                 struct device_domain_info *info2;
2506                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2507                 if (info2) {
2508                         found      = info2->domain;
2509                         info2->dev = dev;
2510                 }
2511         }
2512
2513         if (found) {
2514                 spin_unlock_irqrestore(&device_domain_lock, flags);
2515                 free_devinfo_mem(info);
2516                 /* Caller must free the original domain */
2517                 return found;
2518         }
2519
2520         spin_lock(&iommu->lock);
2521         ret = domain_attach_iommu(domain, iommu);
2522         spin_unlock(&iommu->lock);
2523
2524         if (ret) {
2525                 spin_unlock_irqrestore(&device_domain_lock, flags);
2526                 free_devinfo_mem(info);
2527                 return NULL;
2528         }
2529
2530         list_add(&info->link, &domain->devices);
2531         list_add(&info->global, &device_domain_list);
2532         if (dev)
2533                 dev->archdata.iommu = info;
2534         spin_unlock_irqrestore(&device_domain_lock, flags);
2535
2536         if (dev && domain_context_mapping(domain, dev)) {
2537                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2538                 dmar_remove_one_dev_info(domain, dev);
2539                 return NULL;
2540         }
2541
2542         return domain;
2543 }
2544
2545 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2546 {
2547         *(u16 *)opaque = alias;
2548         return 0;
2549 }
2550
2551 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2552 {
2553         struct device_domain_info *info = NULL;
2554         struct dmar_domain *domain = NULL;
2555         struct intel_iommu *iommu;
2556         u16 req_id, dma_alias;
2557         unsigned long flags;
2558         u8 bus, devfn;
2559
2560         iommu = device_to_iommu(dev, &bus, &devfn);
2561         if (!iommu)
2562                 return NULL;
2563
2564         req_id = ((u16)bus << 8) | devfn;
2565
2566         if (dev_is_pci(dev)) {
2567                 struct pci_dev *pdev = to_pci_dev(dev);
2568
2569                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2570
2571                 spin_lock_irqsave(&device_domain_lock, flags);
2572                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2573                                                       PCI_BUS_NUM(dma_alias),
2574                                                       dma_alias & 0xff);
2575                 if (info) {
2576                         iommu = info->iommu;
2577                         domain = info->domain;
2578                 }
2579                 spin_unlock_irqrestore(&device_domain_lock, flags);
2580
2581                 /* DMA alias already has a domain, use it */
2582                 if (info)
2583                         goto out;
2584         }
2585
2586         /* Allocate and initialize new domain for the device */
2587         domain = alloc_domain(0);
2588         if (!domain)
2589                 return NULL;
2590         if (domain_init(domain, iommu, gaw)) {
2591                 domain_exit(domain);
2592                 return NULL;
2593         }
2594
2595 out:
2596
2597         return domain;
2598 }
2599
2600 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2601                                               struct dmar_domain *domain)
2602 {
2603         struct intel_iommu *iommu;
2604         struct dmar_domain *tmp;
2605         u16 req_id, dma_alias;
2606         u8 bus, devfn;
2607
2608         iommu = device_to_iommu(dev, &bus, &devfn);
2609         if (!iommu)
2610                 return NULL;
2611
2612         req_id = ((u16)bus << 8) | devfn;
2613
2614         if (dev_is_pci(dev)) {
2615                 struct pci_dev *pdev = to_pci_dev(dev);
2616
2617                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2618
2619                 /* register PCI DMA alias device */
2620                 if (req_id != dma_alias) {
2621                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2622                                         dma_alias & 0xff, NULL, domain);
2623
2624                         if (!tmp || tmp != domain)
2625                                 return tmp;
2626                 }
2627         }
2628
2629         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2630         if (!tmp || tmp != domain)
2631                 return tmp;
2632
2633         return domain;
2634 }
2635
2636 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2637 {
2638         struct dmar_domain *domain, *tmp;
2639
2640         domain = find_domain(dev);
2641         if (domain)
2642                 goto out;
2643
2644         domain = find_or_alloc_domain(dev, gaw);
2645         if (!domain)
2646                 goto out;
2647
2648         tmp = set_domain_for_dev(dev, domain);
2649         if (!tmp || domain != tmp) {
2650                 domain_exit(domain);
2651                 domain = tmp;
2652         }
2653
2654 out:
2655
2656         return domain;
2657 }
2658
2659 static int iommu_domain_identity_map(struct dmar_domain *domain,
2660                                      unsigned long long start,
2661                                      unsigned long long end)
2662 {
2663         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2664         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2665
2666         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2667                           dma_to_mm_pfn(last_vpfn))) {
2668                 pr_err("Reserving iova failed\n");
2669                 return -ENOMEM;
2670         }
2671
2672         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2673         /*
2674          * RMRR range might have overlap with physical memory range,
2675          * clear it first
2676          */
2677         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2678
2679         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2680                                   last_vpfn - first_vpfn + 1,
2681                                   DMA_PTE_READ|DMA_PTE_WRITE);
2682 }
2683
2684 static int domain_prepare_identity_map(struct device *dev,
2685                                        struct dmar_domain *domain,
2686                                        unsigned long long start,
2687                                        unsigned long long end)
2688 {
2689         /* For _hardware_ passthrough, don't bother. But for software
2690            passthrough, we do it anyway -- it may indicate a memory
2691            range which is reserved in E820, so which didn't get set
2692            up to start with in si_domain */
2693         if (domain == si_domain && hw_pass_through) {
2694                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2695                         dev_name(dev), start, end);
2696                 return 0;
2697         }
2698
2699         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2700                 dev_name(dev), start, end);
2701
2702         if (end < start) {
2703                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2704                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2705                         dmi_get_system_info(DMI_BIOS_VENDOR),
2706                         dmi_get_system_info(DMI_BIOS_VERSION),
2707                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2708                 return -EIO;
2709         }
2710
2711         if (end >> agaw_to_width(domain->agaw)) {
2712                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2713                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2714                      agaw_to_width(domain->agaw),
2715                      dmi_get_system_info(DMI_BIOS_VENDOR),
2716                      dmi_get_system_info(DMI_BIOS_VERSION),
2717                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2718                 return -EIO;
2719         }
2720
2721         return iommu_domain_identity_map(domain, start, end);
2722 }
2723
2724 static int iommu_prepare_identity_map(struct device *dev,
2725                                       unsigned long long start,
2726                                       unsigned long long end)
2727 {
2728         struct dmar_domain *domain;
2729         int ret;
2730
2731         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2732         if (!domain)
2733                 return -ENOMEM;
2734
2735         ret = domain_prepare_identity_map(dev, domain, start, end);
2736         if (ret)
2737                 domain_exit(domain);
2738
2739         return ret;
2740 }
2741
2742 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2743                                          struct device *dev)
2744 {
2745         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2746                 return 0;
2747         return iommu_prepare_identity_map(dev, rmrr->base_address,
2748                                           rmrr->end_address);
2749 }
2750
2751 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2752 static inline void iommu_prepare_isa(void)
2753 {
2754         struct pci_dev *pdev;
2755         int ret;
2756
2757         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2758         if (!pdev)
2759                 return;
2760
2761         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2762         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2763
2764         if (ret)
2765                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2766
2767         pci_dev_put(pdev);
2768 }
2769 #else
2770 static inline void iommu_prepare_isa(void)
2771 {
2772         return;
2773 }
2774 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
2775
2776 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2777
2778 static int __init si_domain_init(int hw)
2779 {
2780         int nid, ret = 0;
2781
2782         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2783         if (!si_domain)
2784                 return -EFAULT;
2785
2786         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2787                 domain_exit(si_domain);
2788                 return -EFAULT;
2789         }
2790
2791         pr_debug("Identity mapping domain allocated\n");
2792
2793         if (hw)
2794                 return 0;
2795
2796         for_each_online_node(nid) {
2797                 unsigned long start_pfn, end_pfn;
2798                 int i;
2799
2800                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2801                         ret = iommu_domain_identity_map(si_domain,
2802                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2803                         if (ret)
2804                                 return ret;
2805                 }
2806         }
2807
2808         return 0;
2809 }
2810
2811 static int identity_mapping(struct device *dev)
2812 {
2813         struct device_domain_info *info;
2814
2815         if (likely(!iommu_identity_mapping))
2816                 return 0;
2817
2818         info = dev->archdata.iommu;
2819         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2820                 return (info->domain == si_domain);
2821
2822         return 0;
2823 }
2824
2825 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2826 {
2827         struct dmar_domain *ndomain;
2828         struct intel_iommu *iommu;
2829         u8 bus, devfn;
2830
2831         iommu = device_to_iommu(dev, &bus, &devfn);
2832         if (!iommu)
2833                 return -ENODEV;
2834
2835         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2836         if (ndomain != domain)
2837                 return -EBUSY;
2838
2839         return 0;
2840 }
2841
2842 static bool device_has_rmrr(struct device *dev)
2843 {
2844         struct dmar_rmrr_unit *rmrr;
2845         struct device *tmp;
2846         int i;
2847
2848         rcu_read_lock();
2849         for_each_rmrr_units(rmrr) {
2850                 /*
2851                  * Return TRUE if this RMRR contains the device that
2852                  * is passed in.
2853                  */
2854                 for_each_active_dev_scope(rmrr->devices,
2855                                           rmrr->devices_cnt, i, tmp)
2856                         if (tmp == dev) {
2857                                 rcu_read_unlock();
2858                                 return true;
2859                         }
2860         }
2861         rcu_read_unlock();
2862         return false;
2863 }
2864
2865 /*
2866  * There are a couple cases where we need to restrict the functionality of
2867  * devices associated with RMRRs.  The first is when evaluating a device for
2868  * identity mapping because problems exist when devices are moved in and out
2869  * of domains and their respective RMRR information is lost.  This means that
2870  * a device with associated RMRRs will never be in a "passthrough" domain.
2871  * The second is use of the device through the IOMMU API.  This interface
2872  * expects to have full control of the IOVA space for the device.  We cannot
2873  * satisfy both the requirement that RMRR access is maintained and have an
2874  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2875  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2876  * We therefore prevent devices associated with an RMRR from participating in
2877  * the IOMMU API, which eliminates them from device assignment.
2878  *
2879  * In both cases we assume that PCI USB devices with RMRRs have them largely
2880  * for historical reasons and that the RMRR space is not actively used post
2881  * boot.  This exclusion may change if vendors begin to abuse it.
2882  *
2883  * The same exception is made for graphics devices, with the requirement that
2884  * any use of the RMRR regions will be torn down before assigning the device
2885  * to a guest.
2886  */
2887 static bool device_is_rmrr_locked(struct device *dev)
2888 {
2889         if (!device_has_rmrr(dev))
2890                 return false;
2891
2892         if (dev_is_pci(dev)) {
2893                 struct pci_dev *pdev = to_pci_dev(dev);
2894
2895                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2896                         return false;
2897         }
2898
2899         return true;
2900 }
2901
2902 static int iommu_should_identity_map(struct device *dev, int startup)
2903 {
2904
2905         if (dev_is_pci(dev)) {
2906                 struct pci_dev *pdev = to_pci_dev(dev);
2907
2908                 if (device_is_rmrr_locked(dev))
2909                         return 0;
2910
2911                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2912                         return 1;
2913
2914                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2915                         return 1;
2916
2917                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2918                         return 0;
2919
2920                 /*
2921                  * We want to start off with all devices in the 1:1 domain, and
2922                  * take them out later if we find they can't access all of memory.
2923                  *
2924                  * However, we can't do this for PCI devices behind bridges,
2925                  * because all PCI devices behind the same bridge will end up
2926                  * with the same source-id on their transactions.
2927                  *
2928                  * Practically speaking, we can't change things around for these
2929                  * devices at run-time, because we can't be sure there'll be no
2930                  * DMA transactions in flight for any of their siblings.
2931                  *
2932                  * So PCI devices (unless they're on the root bus) as well as
2933                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2934                  * the 1:1 domain, just in _case_ one of their siblings turns out
2935                  * not to be able to map all of memory.
2936                  */
2937                 if (!pci_is_pcie(pdev)) {
2938                         if (!pci_is_root_bus(pdev->bus))
2939                                 return 0;
2940                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2941                                 return 0;
2942                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2943                         return 0;
2944         } else {
2945                 if (device_has_rmrr(dev))
2946                         return 0;
2947         }
2948
2949         /*
2950          * At boot time, we don't yet know if devices will be 64-bit capable.
2951          * Assume that they will — if they turn out not to be, then we can
2952          * take them out of the 1:1 domain later.
2953          */
2954         if (!startup) {
2955                 /*
2956                  * If the device's dma_mask is less than the system's memory
2957                  * size then this is not a candidate for identity mapping.
2958                  */
2959                 u64 dma_mask = *dev->dma_mask;
2960
2961                 if (dev->coherent_dma_mask &&
2962                     dev->coherent_dma_mask < dma_mask)
2963                         dma_mask = dev->coherent_dma_mask;
2964
2965                 return dma_mask >= dma_get_required_mask(dev);
2966         }
2967
2968         return 1;
2969 }
2970
2971 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2972 {
2973         int ret;
2974
2975         if (!iommu_should_identity_map(dev, 1))
2976                 return 0;
2977
2978         ret = domain_add_dev_info(si_domain, dev);
2979         if (!ret)
2980                 pr_info("%s identity mapping for device %s\n",
2981                         hw ? "Hardware" : "Software", dev_name(dev));
2982         else if (ret == -ENODEV)
2983                 /* device not associated with an iommu */
2984                 ret = 0;
2985
2986         return ret;
2987 }
2988
2989
2990 static int __init iommu_prepare_static_identity_mapping(int hw)
2991 {
2992         struct pci_dev *pdev = NULL;
2993         struct dmar_drhd_unit *drhd;
2994         struct intel_iommu *iommu;
2995         struct device *dev;
2996         int i;
2997         int ret = 0;
2998
2999         for_each_pci_dev(pdev) {
3000                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3001                 if (ret)
3002                         return ret;
3003         }
3004
3005         for_each_active_iommu(iommu, drhd)
3006                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3007                         struct acpi_device_physical_node *pn;
3008                         struct acpi_device *adev;
3009
3010                         if (dev->bus != &acpi_bus_type)
3011                                 continue;
3012
3013                         adev= to_acpi_device(dev);
3014                         mutex_lock(&adev->physical_node_lock);
3015                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3016                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3017                                 if (ret)
3018                                         break;
3019                         }
3020                         mutex_unlock(&adev->physical_node_lock);
3021                         if (ret)
3022                                 return ret;
3023                 }
3024
3025         return 0;
3026 }
3027
3028 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3029 {
3030         /*
3031          * Start from the sane iommu hardware state.
3032          * If the queued invalidation is already initialized by us
3033          * (for example, while enabling interrupt-remapping) then
3034          * we got the things already rolling from a sane state.
3035          */
3036         if (!iommu->qi) {
3037                 /*
3038                  * Clear any previous faults.
3039                  */
3040                 dmar_fault(-1, iommu);
3041                 /*
3042                  * Disable queued invalidation if supported and already enabled
3043                  * before OS handover.
3044                  */
3045                 dmar_disable_qi(iommu);
3046         }
3047
3048         if (dmar_enable_qi(iommu)) {
3049                 /*
3050                  * Queued Invalidate not enabled, use Register Based Invalidate
3051                  */
3052                 iommu->flush.flush_context = __iommu_flush_context;
3053                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3054                 pr_info("%s: Using Register based invalidation\n",
3055                         iommu->name);
3056         } else {
3057                 iommu->flush.flush_context = qi_flush_context;
3058                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3059                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3060         }
3061 }
3062
3063 static int copy_context_table(struct intel_iommu *iommu,
3064                               struct root_entry *old_re,
3065                               struct context_entry **tbl,
3066                               int bus, bool ext)
3067 {
3068         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3069         struct context_entry *new_ce = NULL, ce;
3070         struct context_entry *old_ce = NULL;
3071         struct root_entry re;
3072         phys_addr_t old_ce_phys;
3073
3074         tbl_idx = ext ? bus * 2 : bus;
3075         memcpy(&re, old_re, sizeof(re));
3076
3077         for (devfn = 0; devfn < 256; devfn++) {
3078                 /* First calculate the correct index */
3079                 idx = (ext ? devfn * 2 : devfn) % 256;
3080
3081                 if (idx == 0) {
3082                         /* First save what we may have and clean up */
3083                         if (new_ce) {
3084                                 tbl[tbl_idx] = new_ce;
3085                                 __iommu_flush_cache(iommu, new_ce,
3086                                                     VTD_PAGE_SIZE);
3087                                 pos = 1;
3088                         }
3089
3090                         if (old_ce)
3091                                 memunmap(old_ce);
3092
3093                         ret = 0;
3094                         if (devfn < 0x80)
3095                                 old_ce_phys = root_entry_lctp(&re);
3096                         else
3097                                 old_ce_phys = root_entry_uctp(&re);
3098
3099                         if (!old_ce_phys) {
3100                                 if (ext && devfn == 0) {
3101                                         /* No LCTP, try UCTP */
3102                                         devfn = 0x7f;
3103                                         continue;
3104                                 } else {
3105                                         goto out;
3106                                 }
3107                         }
3108
3109                         ret = -ENOMEM;
3110                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3111                                         MEMREMAP_WB);
3112                         if (!old_ce)
3113                                 goto out;
3114
3115                         new_ce = alloc_pgtable_page(iommu->node);
3116                         if (!new_ce)
3117                                 goto out_unmap;
3118
3119                         ret = 0;
3120                 }
3121
3122                 /* Now copy the context entry */
3123                 memcpy(&ce, old_ce + idx, sizeof(ce));
3124
3125                 if (!__context_present(&ce))
3126                         continue;
3127
3128                 did = context_domain_id(&ce);
3129                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3130                         set_bit(did, iommu->domain_ids);
3131
3132                 /*
3133                  * We need a marker for copied context entries. This
3134                  * marker needs to work for the old format as well as
3135                  * for extended context entries.
3136                  *
3137                  * Bit 67 of the context entry is used. In the old
3138                  * format this bit is available to software, in the
3139                  * extended format it is the PGE bit, but PGE is ignored
3140                  * by HW if PASIDs are disabled (and thus still
3141                  * available).
3142                  *
3143                  * So disable PASIDs first and then mark the entry
3144                  * copied. This means that we don't copy PASID
3145                  * translations from the old kernel, but this is fine as
3146                  * faults there are not fatal.
3147                  */
3148                 context_clear_pasid_enable(&ce);
3149                 context_set_copied(&ce);
3150
3151                 new_ce[idx] = ce;
3152         }
3153
3154         tbl[tbl_idx + pos] = new_ce;
3155
3156         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3157
3158 out_unmap:
3159         memunmap(old_ce);
3160
3161 out:
3162         return ret;
3163 }
3164
3165 static int copy_translation_tables(struct intel_iommu *iommu)
3166 {
3167         struct context_entry **ctxt_tbls;
3168         struct root_entry *old_rt;
3169         phys_addr_t old_rt_phys;
3170         int ctxt_table_entries;
3171         unsigned long flags;
3172         u64 rtaddr_reg;
3173         int bus, ret;
3174         bool new_ext, ext;
3175
3176         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3177         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3178         new_ext    = !!ecap_ecs(iommu->ecap);
3179
3180         /*
3181          * The RTT bit can only be changed when translation is disabled,
3182          * but disabling translation means to open a window for data
3183          * corruption. So bail out and don't copy anything if we would
3184          * have to change the bit.
3185          */
3186         if (new_ext != ext)
3187                 return -EINVAL;
3188
3189         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3190         if (!old_rt_phys)
3191                 return -EINVAL;
3192
3193         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3194         if (!old_rt)
3195                 return -ENOMEM;
3196
3197         /* This is too big for the stack - allocate it from slab */
3198         ctxt_table_entries = ext ? 512 : 256;
3199         ret = -ENOMEM;
3200         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3201         if (!ctxt_tbls)
3202                 goto out_unmap;
3203
3204         for (bus = 0; bus < 256; bus++) {
3205                 ret = copy_context_table(iommu, &old_rt[bus],
3206                                          ctxt_tbls, bus, ext);
3207                 if (ret) {
3208                         pr_err("%s: Failed to copy context table for bus %d\n",
3209                                 iommu->name, bus);
3210                         continue;
3211                 }
3212         }
3213
3214         spin_lock_irqsave(&iommu->lock, flags);
3215
3216         /* Context tables are copied, now write them to the root_entry table */
3217         for (bus = 0; bus < 256; bus++) {
3218                 int idx = ext ? bus * 2 : bus;
3219                 u64 val;
3220
3221                 if (ctxt_tbls[idx]) {
3222                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3223                         iommu->root_entry[bus].lo = val;
3224                 }
3225
3226                 if (!ext || !ctxt_tbls[idx + 1])
3227                         continue;
3228
3229                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3230                 iommu->root_entry[bus].hi = val;
3231         }
3232
3233         spin_unlock_irqrestore(&iommu->lock, flags);
3234
3235         kfree(ctxt_tbls);
3236
3237         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3238
3239         ret = 0;
3240
3241 out_unmap:
3242         memunmap(old_rt);
3243
3244         return ret;
3245 }
3246
3247 static int __init init_dmars(void)
3248 {
3249         struct dmar_drhd_unit *drhd;
3250         struct dmar_rmrr_unit *rmrr;
3251         bool copied_tables = false;
3252         struct device *dev;
3253         struct intel_iommu *iommu;
3254         int i, ret;
3255
3256         /*
3257          * for each drhd
3258          *    allocate root
3259          *    initialize and program root entry to not present
3260          * endfor
3261          */
3262         for_each_drhd_unit(drhd) {
3263                 /*
3264                  * lock not needed as this is only incremented in the single
3265                  * threaded kernel __init code path all other access are read
3266                  * only
3267                  */
3268                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3269                         g_num_of_iommus++;
3270                         continue;
3271                 }
3272                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3273         }
3274
3275         /* Preallocate enough resources for IOMMU hot-addition */
3276         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3277                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3278
3279         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3280                         GFP_KERNEL);
3281         if (!g_iommus) {
3282                 pr_err("Allocating global iommu array failed\n");
3283                 ret = -ENOMEM;
3284                 goto error;
3285         }
3286
3287         for_each_active_iommu(iommu, drhd) {
3288                 g_iommus[iommu->seq_id] = iommu;
3289
3290                 intel_iommu_init_qi(iommu);
3291
3292                 ret = iommu_init_domains(iommu);
3293                 if (ret)
3294                         goto free_iommu;
3295
3296                 init_translation_status(iommu);
3297
3298                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3299                         iommu_disable_translation(iommu);
3300                         clear_translation_pre_enabled(iommu);
3301                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3302                                 iommu->name);
3303                 }
3304
3305                 /*
3306                  * TBD:
3307                  * we could share the same root & context tables
3308                  * among all IOMMU's. Need to Split it later.
3309                  */
3310                 ret = iommu_alloc_root_entry(iommu);
3311                 if (ret)
3312                         goto free_iommu;
3313
3314                 if (translation_pre_enabled(iommu)) {
3315                         pr_info("Translation already enabled - trying to copy translation structures\n");
3316
3317                         ret = copy_translation_tables(iommu);
3318                         if (ret) {
3319                                 /*
3320                                  * We found the IOMMU with translation
3321                                  * enabled - but failed to copy over the
3322                                  * old root-entry table. Try to proceed
3323                                  * by disabling translation now and
3324                                  * allocating a clean root-entry table.
3325                                  * This might cause DMAR faults, but
3326                                  * probably the dump will still succeed.
3327                                  */
3328                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3329                                        iommu->name);
3330                                 iommu_disable_translation(iommu);
3331                                 clear_translation_pre_enabled(iommu);
3332                         } else {
3333                                 pr_info("Copied translation tables from previous kernel for %s\n",
3334                                         iommu->name);
3335                                 copied_tables = true;
3336                         }
3337                 }
3338
3339                 if (!ecap_pass_through(iommu->ecap))
3340                         hw_pass_through = 0;
3341
3342                 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3343                         pr_info("Disable batched IOTLB flush due to virtualization");
3344                         intel_iommu_strict = 1;
3345                 }
3346
3347 #ifdef CONFIG_INTEL_IOMMU_SVM
3348                 if (pasid_enabled(iommu))
3349                         intel_svm_alloc_pasid_tables(iommu);
3350 #endif
3351         }
3352
3353         /*
3354          * Now that qi is enabled on all iommus, set the root entry and flush
3355          * caches. This is required on some Intel X58 chipsets, otherwise the
3356          * flush_context function will loop forever and the boot hangs.
3357          */
3358         for_each_active_iommu(iommu, drhd) {
3359                 iommu_flush_write_buffer(iommu);
3360                 iommu_set_root_entry(iommu);
3361                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3362                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3363         }
3364
3365         if (iommu_pass_through)
3366                 iommu_identity_mapping |= IDENTMAP_ALL;
3367
3368 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3369         dmar_map_gfx = 0;
3370 #endif
3371
3372         if (!dmar_map_gfx)
3373                 iommu_identity_mapping |= IDENTMAP_GFX;
3374
3375         check_tylersburg_isoch();
3376
3377         if (iommu_identity_mapping) {
3378                 ret = si_domain_init(hw_pass_through);
3379                 if (ret)
3380                         goto free_iommu;
3381         }
3382
3383
3384         /*
3385          * If we copied translations from a previous kernel in the kdump
3386          * case, we can not assign the devices to domains now, as that
3387          * would eliminate the old mappings. So skip this part and defer
3388          * the assignment to device driver initialization time.
3389          */
3390         if (copied_tables)
3391                 goto domains_done;
3392
3393         /*
3394          * If pass through is not set or not enabled, setup context entries for
3395          * identity mappings for rmrr, gfx, and isa and may fall back to static
3396          * identity mapping if iommu_identity_mapping is set.
3397          */
3398         if (iommu_identity_mapping) {
3399                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3400                 if (ret) {
3401                         pr_crit("Failed to setup IOMMU pass-through\n");
3402                         goto free_iommu;
3403                 }
3404         }
3405         /*
3406          * For each rmrr
3407          *   for each dev attached to rmrr
3408          *   do
3409          *     locate drhd for dev, alloc domain for dev
3410          *     allocate free domain
3411          *     allocate page table entries for rmrr
3412          *     if context not allocated for bus
3413          *           allocate and init context
3414          *           set present in root table for this bus
3415          *     init context with domain, translation etc
3416          *    endfor
3417          * endfor
3418          */
3419         pr_info("Setting RMRR:\n");
3420         for_each_rmrr_units(rmrr) {
3421                 /* some BIOS lists non-exist devices in DMAR table. */
3422                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3423                                           i, dev) {
3424                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3425                         if (ret)
3426                                 pr_err("Mapping reserved region failed\n");
3427                 }
3428         }
3429
3430         iommu_prepare_isa();
3431
3432 domains_done:
3433
3434         /*
3435          * for each drhd
3436          *   enable fault log
3437          *   global invalidate context cache
3438          *   global invalidate iotlb
3439          *   enable translation
3440          */
3441         for_each_iommu(iommu, drhd) {
3442                 if (drhd->ignored) {
3443                         /*
3444                          * we always have to disable PMRs or DMA may fail on
3445                          * this device
3446                          */
3447                         if (force_on)
3448                                 iommu_disable_protect_mem_regions(iommu);
3449                         continue;
3450                 }
3451
3452                 iommu_flush_write_buffer(iommu);
3453
3454 #ifdef CONFIG_INTEL_IOMMU_SVM
3455                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3456                         ret = intel_svm_enable_prq(iommu);
3457                         if (ret)
3458                                 goto free_iommu;
3459                 }
3460 #endif
3461                 ret = dmar_set_interrupt(iommu);
3462                 if (ret)
3463                         goto free_iommu;
3464
3465                 if (!translation_pre_enabled(iommu))
3466                         iommu_enable_translation(iommu);
3467
3468                 iommu_disable_protect_mem_regions(iommu);
3469         }
3470
3471         return 0;
3472
3473 free_iommu:
3474         for_each_active_iommu(iommu, drhd) {
3475                 disable_dmar_iommu(iommu);
3476                 free_dmar_iommu(iommu);
3477         }
3478
3479         kfree(g_iommus);
3480
3481 error:
3482         return ret;
3483 }
3484
3485 /* This takes a number of _MM_ pages, not VTD pages */
3486 static unsigned long intel_alloc_iova(struct device *dev,
3487                                      struct dmar_domain *domain,
3488                                      unsigned long nrpages, uint64_t dma_mask)
3489 {
3490         unsigned long iova_pfn = 0;
3491
3492         /* Restrict dma_mask to the width that the iommu can handle */
3493         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3494         /* Ensure we reserve the whole size-aligned region */
3495         nrpages = __roundup_pow_of_two(nrpages);
3496
3497         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3498                 /*
3499                  * First try to allocate an io virtual address in
3500                  * DMA_BIT_MASK(32) and if that fails then try allocating
3501                  * from higher range
3502                  */
3503                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3504                                            IOVA_PFN(DMA_BIT_MASK(32)));
3505                 if (iova_pfn)
3506                         return iova_pfn;
3507         }
3508         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3509         if (unlikely(!iova_pfn)) {
3510                 pr_err("Allocating %ld-page iova for %s failed",
3511                        nrpages, dev_name(dev));
3512                 return 0;
3513         }
3514
3515         return iova_pfn;
3516 }
3517
3518 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3519 {
3520         struct dmar_domain *domain, *tmp;
3521         struct dmar_rmrr_unit *rmrr;
3522         struct device *i_dev;
3523         int i, ret;
3524
3525         domain = find_domain(dev);
3526         if (domain)
3527                 goto out;
3528
3529         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3530         if (!domain)
3531                 goto out;
3532
3533         /* We have a new domain - setup possible RMRRs for the device */
3534         rcu_read_lock();
3535         for_each_rmrr_units(rmrr) {
3536                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3537                                           i, i_dev) {
3538                         if (i_dev != dev)
3539                                 continue;
3540
3541                         ret = domain_prepare_identity_map(dev, domain,
3542                                                           rmrr->base_address,
3543                                                           rmrr->end_address);
3544                         if (ret)
3545                                 dev_err(dev, "Mapping reserved region failed\n");
3546                 }
3547         }
3548         rcu_read_unlock();
3549
3550         tmp = set_domain_for_dev(dev, domain);
3551         if (!tmp || domain != tmp) {
3552                 domain_exit(domain);
3553                 domain = tmp;
3554         }
3555
3556 out:
3557
3558         if (!domain)
3559                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3560
3561
3562         return domain;
3563 }
3564
3565 /* Check if the dev needs to go through non-identity map and unmap process.*/
3566 static int iommu_no_mapping(struct device *dev)
3567 {
3568         int found;
3569
3570         if (iommu_dummy(dev))
3571                 return 1;
3572
3573         if (!iommu_identity_mapping)
3574                 return 0;
3575
3576         found = identity_mapping(dev);
3577         if (found) {
3578                 if (iommu_should_identity_map(dev, 0))
3579                         return 1;
3580                 else {
3581                         /*
3582                          * 32 bit DMA is removed from si_domain and fall back
3583                          * to non-identity mapping.
3584                          */
3585                         dmar_remove_one_dev_info(si_domain, dev);
3586                         pr_info("32bit %s uses non-identity mapping\n",
3587                                 dev_name(dev));
3588                         return 0;
3589                 }
3590         } else {
3591                 /*
3592                  * In case of a detached 64 bit DMA device from vm, the device
3593                  * is put into si_domain for identity mapping.
3594                  */
3595                 if (iommu_should_identity_map(dev, 0)) {
3596                         int ret;
3597                         ret = domain_add_dev_info(si_domain, dev);
3598                         if (!ret) {
3599                                 pr_info("64bit %s uses identity mapping\n",
3600                                         dev_name(dev));
3601                                 return 1;
3602                         }
3603                 }
3604         }
3605
3606         return 0;
3607 }
3608
3609 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3610                                      size_t size, int dir, u64 dma_mask)
3611 {
3612         struct dmar_domain *domain;
3613         phys_addr_t start_paddr;
3614         unsigned long iova_pfn;
3615         int prot = 0;
3616         int ret;
3617         struct intel_iommu *iommu;
3618         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3619
3620         BUG_ON(dir == DMA_NONE);
3621
3622         if (iommu_no_mapping(dev))
3623                 return paddr;
3624
3625         domain = get_valid_domain_for_dev(dev);
3626         if (!domain)
3627                 return 0;
3628
3629         iommu = domain_get_iommu(domain);
3630         size = aligned_nrpages(paddr, size);
3631
3632         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3633         if (!iova_pfn)
3634                 goto error;
3635
3636         /*
3637          * Check if DMAR supports zero-length reads on write only
3638          * mappings..
3639          */
3640         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3641                         !cap_zlr(iommu->cap))
3642                 prot |= DMA_PTE_READ;
3643         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3644                 prot |= DMA_PTE_WRITE;
3645         /*
3646          * paddr - (paddr + size) might be partial page, we should map the whole
3647          * page.  Note: if two part of one page are separately mapped, we
3648          * might have two guest_addr mapping to the same host paddr, but this
3649          * is not a big problem
3650          */
3651         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3652                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3653         if (ret)
3654                 goto error;
3655
3656         /* it's a non-present to present mapping. Only flush if caching mode */
3657         if (cap_caching_mode(iommu->cap))
3658                 iommu_flush_iotlb_psi(iommu, domain,
3659                                       mm_to_dma_pfn(iova_pfn),
3660                                       size, 0, 1);
3661         else
3662                 iommu_flush_write_buffer(iommu);
3663
3664         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3665         start_paddr += paddr & ~PAGE_MASK;
3666         return start_paddr;
3667
3668 error:
3669         if (iova_pfn)
3670                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3671         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3672                 dev_name(dev), size, (unsigned long long)paddr, dir);
3673         return 0;
3674 }
3675
3676 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3677                                  unsigned long offset, size_t size,
3678                                  enum dma_data_direction dir,
3679                                  unsigned long attrs)
3680 {
3681         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3682                                   dir, *dev->dma_mask);
3683 }
3684
3685 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3686 {
3687         struct dmar_domain *domain;
3688         unsigned long start_pfn, last_pfn;
3689         unsigned long nrpages;
3690         unsigned long iova_pfn;
3691         struct intel_iommu *iommu;
3692         struct page *freelist;
3693
3694         if (iommu_no_mapping(dev))
3695                 return;
3696
3697         domain = find_domain(dev);
3698         BUG_ON(!domain);
3699
3700         iommu = domain_get_iommu(domain);
3701
3702         iova_pfn = IOVA_PFN(dev_addr);
3703
3704         nrpages = aligned_nrpages(dev_addr, size);
3705         start_pfn = mm_to_dma_pfn(iova_pfn);
3706         last_pfn = start_pfn + nrpages - 1;
3707
3708         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3709                  dev_name(dev), start_pfn, last_pfn);
3710
3711         freelist = domain_unmap(domain, start_pfn, last_pfn);
3712
3713         if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) {
3714                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3715                                       nrpages, !freelist, 0);
3716                 /* free iova */
3717                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3718                 dma_free_pagelist(freelist);
3719         } else {
3720                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3721                            (unsigned long)freelist);
3722                 /*
3723                  * queue up the release of the unmap to save the 1/6th of the
3724                  * cpu used up by the iotlb flush operation...
3725                  */
3726         }
3727 }
3728
3729 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3730                              size_t size, enum dma_data_direction dir,
3731                              unsigned long attrs)
3732 {
3733         intel_unmap(dev, dev_addr, size);
3734 }
3735
3736 static void *intel_alloc_coherent(struct device *dev, size_t size,
3737                                   dma_addr_t *dma_handle, gfp_t flags,
3738                                   unsigned long attrs)
3739 {
3740         struct page *page = NULL;
3741         int order;
3742
3743         size = PAGE_ALIGN(size);
3744         order = get_order(size);
3745
3746         if (!iommu_no_mapping(dev))
3747                 flags &= ~(GFP_DMA | GFP_DMA32);
3748         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3749                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3750                         flags |= GFP_DMA;
3751                 else
3752                         flags |= GFP_DMA32;
3753         }
3754
3755         if (gfpflags_allow_blocking(flags)) {
3756                 unsigned int count = size >> PAGE_SHIFT;
3757
3758                 page = dma_alloc_from_contiguous(dev, count, order, flags);
3759                 if (page && iommu_no_mapping(dev) &&
3760                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3761                         dma_release_from_contiguous(dev, page, count);
3762                         page = NULL;
3763                 }
3764         }
3765
3766         if (!page)
3767                 page = alloc_pages(flags, order);
3768         if (!page)
3769                 return NULL;
3770         memset(page_address(page), 0, size);
3771
3772         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3773                                          DMA_BIDIRECTIONAL,
3774                                          dev->coherent_dma_mask);
3775         if (*dma_handle)
3776                 return page_address(page);
3777         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3778                 __free_pages(page, order);
3779
3780         return NULL;
3781 }
3782
3783 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3784                                 dma_addr_t dma_handle, unsigned long attrs)
3785 {
3786         int order;
3787         struct page *page = virt_to_page(vaddr);
3788
3789         size = PAGE_ALIGN(size);
3790         order = get_order(size);
3791
3792         intel_unmap(dev, dma_handle, size);
3793         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3794                 __free_pages(page, order);
3795 }
3796
3797 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3798                            int nelems, enum dma_data_direction dir,
3799                            unsigned long attrs)
3800 {
3801         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3802         unsigned long nrpages = 0;
3803         struct scatterlist *sg;
3804         int i;
3805
3806         for_each_sg(sglist, sg, nelems, i) {
3807                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3808         }
3809
3810         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3811 }
3812
3813 static int intel_nontranslate_map_sg(struct device *hddev,
3814         struct scatterlist *sglist, int nelems, int dir)
3815 {
3816         int i;
3817         struct scatterlist *sg;
3818
3819         for_each_sg(sglist, sg, nelems, i) {
3820                 BUG_ON(!sg_page(sg));
3821                 sg->dma_address = sg_phys(sg);
3822                 sg->dma_length = sg->length;
3823         }
3824         return nelems;
3825 }
3826
3827 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3828                         enum dma_data_direction dir, unsigned long attrs)
3829 {
3830         int i;
3831         struct dmar_domain *domain;
3832         size_t size = 0;
3833         int prot = 0;
3834         unsigned long iova_pfn;
3835         int ret;
3836         struct scatterlist *sg;
3837         unsigned long start_vpfn;
3838         struct intel_iommu *iommu;
3839
3840         BUG_ON(dir == DMA_NONE);
3841         if (iommu_no_mapping(dev))
3842                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3843
3844         domain = get_valid_domain_for_dev(dev);
3845         if (!domain)
3846                 return 0;
3847
3848         iommu = domain_get_iommu(domain);
3849
3850         for_each_sg(sglist, sg, nelems, i)
3851                 size += aligned_nrpages(sg->offset, sg->length);
3852
3853         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3854                                 *dev->dma_mask);
3855         if (!iova_pfn) {
3856                 sglist->dma_length = 0;
3857                 return 0;
3858         }
3859
3860         /*
3861          * Check if DMAR supports zero-length reads on write only
3862          * mappings..
3863          */
3864         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3865                         !cap_zlr(iommu->cap))
3866                 prot |= DMA_PTE_READ;
3867         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3868                 prot |= DMA_PTE_WRITE;
3869
3870         start_vpfn = mm_to_dma_pfn(iova_pfn);
3871
3872         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3873         if (unlikely(ret)) {
3874                 dma_pte_free_pagetable(domain, start_vpfn,
3875                                        start_vpfn + size - 1,
3876                                        agaw_to_level(domain->agaw) + 1);
3877                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3878                 return 0;
3879         }
3880
3881         /* it's a non-present to present mapping. Only flush if caching mode */
3882         if (cap_caching_mode(iommu->cap))
3883                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3884         else
3885                 iommu_flush_write_buffer(iommu);
3886
3887         return nelems;
3888 }
3889
3890 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3891 {
3892         return !dma_addr;
3893 }
3894
3895 const struct dma_map_ops intel_dma_ops = {
3896         .alloc = intel_alloc_coherent,
3897         .free = intel_free_coherent,
3898         .map_sg = intel_map_sg,
3899         .unmap_sg = intel_unmap_sg,
3900         .map_page = intel_map_page,
3901         .unmap_page = intel_unmap_page,
3902         .mapping_error = intel_mapping_error,
3903 #ifdef CONFIG_X86
3904         .dma_supported = x86_dma_supported,
3905 #endif
3906 };
3907
3908 static inline int iommu_domain_cache_init(void)
3909 {
3910         int ret = 0;
3911
3912         iommu_domain_cache = kmem_cache_create("iommu_domain",
3913                                          sizeof(struct dmar_domain),
3914                                          0,
3915                                          SLAB_HWCACHE_ALIGN,
3916
3917                                          NULL);
3918         if (!iommu_domain_cache) {
3919                 pr_err("Couldn't create iommu_domain cache\n");
3920                 ret = -ENOMEM;
3921         }
3922
3923         return ret;
3924 }
3925
3926 static inline int iommu_devinfo_cache_init(void)
3927 {
3928         int ret = 0;
3929
3930         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3931                                          sizeof(struct device_domain_info),
3932                                          0,
3933                                          SLAB_HWCACHE_ALIGN,
3934                                          NULL);
3935         if (!iommu_devinfo_cache) {
3936                 pr_err("Couldn't create devinfo cache\n");
3937                 ret = -ENOMEM;
3938         }
3939
3940         return ret;
3941 }
3942
3943 static int __init iommu_init_mempool(void)
3944 {
3945         int ret;
3946         ret = iova_cache_get();
3947         if (ret)
3948                 return ret;
3949
3950         ret = iommu_domain_cache_init();
3951         if (ret)
3952                 goto domain_error;
3953
3954         ret = iommu_devinfo_cache_init();
3955         if (!ret)
3956                 return ret;
3957
3958         kmem_cache_destroy(iommu_domain_cache);
3959 domain_error:
3960         iova_cache_put();
3961
3962         return -ENOMEM;
3963 }
3964
3965 static void __init iommu_exit_mempool(void)
3966 {
3967         kmem_cache_destroy(iommu_devinfo_cache);
3968         kmem_cache_destroy(iommu_domain_cache);
3969         iova_cache_put();
3970 }
3971
3972 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3973 {
3974         struct dmar_drhd_unit *drhd;
3975         u32 vtbar;
3976         int rc;
3977
3978         /* We know that this device on this chipset has its own IOMMU.
3979          * If we find it under a different IOMMU, then the BIOS is lying
3980          * to us. Hope that the IOMMU for this device is actually
3981          * disabled, and it needs no translation...
3982          */
3983         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3984         if (rc) {
3985                 /* "can't" happen */
3986                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3987                 return;
3988         }
3989         vtbar &= 0xffff0000;
3990
3991         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3992         drhd = dmar_find_matched_drhd_unit(pdev);
3993         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3994                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
3995                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3996                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3997         }
3998 }
3999 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4000
4001 static void __init init_no_remapping_devices(void)
4002 {
4003         struct dmar_drhd_unit *drhd;
4004         struct device *dev;
4005         int i;
4006
4007         for_each_drhd_unit(drhd) {
4008                 if (!drhd->include_all) {
4009                         for_each_active_dev_scope(drhd->devices,
4010                                                   drhd->devices_cnt, i, dev)
4011                                 break;
4012                         /* ignore DMAR unit if no devices exist */
4013                         if (i == drhd->devices_cnt)
4014                                 drhd->ignored = 1;
4015                 }
4016         }
4017
4018         for_each_active_drhd_unit(drhd) {
4019                 if (drhd->include_all)
4020                         continue;
4021
4022                 for_each_active_dev_scope(drhd->devices,
4023                                           drhd->devices_cnt, i, dev)
4024                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4025                                 break;
4026                 if (i < drhd->devices_cnt)
4027                         continue;
4028
4029                 /* This IOMMU has *only* gfx devices. Either bypass it or
4030                    set the gfx_mapped flag, as appropriate */
4031                 if (!dmar_map_gfx) {
4032                         drhd->ignored = 1;
4033                         for_each_active_dev_scope(drhd->devices,
4034                                                   drhd->devices_cnt, i, dev)
4035                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4036                 }
4037         }
4038 }
4039
4040 #ifdef CONFIG_SUSPEND
4041 static int init_iommu_hw(void)
4042 {
4043         struct dmar_drhd_unit *drhd;
4044         struct intel_iommu *iommu = NULL;
4045
4046         for_each_active_iommu(iommu, drhd)
4047                 if (iommu->qi)
4048                         dmar_reenable_qi(iommu);
4049
4050         for_each_iommu(iommu, drhd) {
4051                 if (drhd->ignored) {
4052                         /*
4053                          * we always have to disable PMRs or DMA may fail on
4054                          * this device
4055                          */
4056                         if (force_on)
4057                                 iommu_disable_protect_mem_regions(iommu);
4058                         continue;
4059                 }
4060
4061                 iommu_flush_write_buffer(iommu);
4062
4063                 iommu_set_root_entry(iommu);
4064
4065                 iommu->flush.flush_context(iommu, 0, 0, 0,
4066                                            DMA_CCMD_GLOBAL_INVL);
4067                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4068                 iommu_enable_translation(iommu);
4069                 iommu_disable_protect_mem_regions(iommu);
4070         }
4071
4072         return 0;
4073 }
4074
4075 static void iommu_flush_all(void)
4076 {
4077         struct dmar_drhd_unit *drhd;
4078         struct intel_iommu *iommu;
4079
4080         for_each_active_iommu(iommu, drhd) {
4081                 iommu->flush.flush_context(iommu, 0, 0, 0,
4082                                            DMA_CCMD_GLOBAL_INVL);
4083                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4084                                          DMA_TLB_GLOBAL_FLUSH);
4085         }
4086 }
4087
4088 static int iommu_suspend(void)
4089 {
4090         struct dmar_drhd_unit *drhd;
4091         struct intel_iommu *iommu = NULL;
4092         unsigned long flag;
4093
4094         for_each_active_iommu(iommu, drhd) {
4095                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4096                                                  GFP_ATOMIC);
4097                 if (!iommu->iommu_state)
4098                         goto nomem;
4099         }
4100
4101         iommu_flush_all();
4102
4103         for_each_active_iommu(iommu, drhd) {
4104                 iommu_disable_translation(iommu);
4105
4106                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4107
4108                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4109                         readl(iommu->reg + DMAR_FECTL_REG);
4110                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4111                         readl(iommu->reg + DMAR_FEDATA_REG);
4112                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4113                         readl(iommu->reg + DMAR_FEADDR_REG);
4114                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4115                         readl(iommu->reg + DMAR_FEUADDR_REG);
4116
4117                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4118         }
4119         return 0;
4120
4121 nomem:
4122         for_each_active_iommu(iommu, drhd)
4123                 kfree(iommu->iommu_state);
4124
4125         return -ENOMEM;
4126 }
4127
4128 static void iommu_resume(void)
4129 {
4130         struct dmar_drhd_unit *drhd;
4131         struct intel_iommu *iommu = NULL;
4132         unsigned long flag;
4133
4134         if (init_iommu_hw()) {
4135                 if (force_on)
4136                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4137                 else
4138                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4139                 return;
4140         }
4141
4142         for_each_active_iommu(iommu, drhd) {
4143
4144                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4145
4146                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4147                         iommu->reg + DMAR_FECTL_REG);
4148                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4149                         iommu->reg + DMAR_FEDATA_REG);
4150                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4151                         iommu->reg + DMAR_FEADDR_REG);
4152                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4153                         iommu->reg + DMAR_FEUADDR_REG);
4154
4155                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4156         }
4157
4158         for_each_active_iommu(iommu, drhd)
4159                 kfree(iommu->iommu_state);
4160 }
4161
4162 static struct syscore_ops iommu_syscore_ops = {
4163         .resume         = iommu_resume,
4164         .suspend        = iommu_suspend,
4165 };
4166
4167 static void __init init_iommu_pm_ops(void)
4168 {
4169         register_syscore_ops(&iommu_syscore_ops);
4170 }
4171
4172 #else
4173 static inline void init_iommu_pm_ops(void) {}
4174 #endif  /* CONFIG_PM */
4175
4176
4177 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4178 {
4179         struct acpi_dmar_reserved_memory *rmrr;
4180         struct dmar_rmrr_unit *rmrru;
4181         size_t length;
4182
4183         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4184         if (!rmrru)
4185                 goto out;
4186
4187         rmrru->hdr = header;
4188         rmrr = (struct acpi_dmar_reserved_memory *)header;
4189         rmrru->base_address = rmrr->base_address;
4190         rmrru->end_address = rmrr->end_address;
4191
4192         length = rmrr->end_address - rmrr->base_address + 1;
4193
4194         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4195                                 ((void *)rmrr) + rmrr->header.length,
4196                                 &rmrru->devices_cnt);
4197         if (rmrru->devices_cnt && rmrru->devices == NULL)
4198                 goto free_rmrru;
4199
4200         list_add(&rmrru->list, &dmar_rmrr_units);
4201
4202         return 0;
4203 free_rmrru:
4204         kfree(rmrru);
4205 out:
4206         return -ENOMEM;
4207 }
4208
4209 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4210 {
4211         struct dmar_atsr_unit *atsru;
4212         struct acpi_dmar_atsr *tmp;
4213
4214         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4215                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4216                 if (atsr->segment != tmp->segment)
4217                         continue;
4218                 if (atsr->header.length != tmp->header.length)
4219                         continue;
4220                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4221                         return atsru;
4222         }
4223
4224         return NULL;
4225 }
4226
4227 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4228 {
4229         struct acpi_dmar_atsr *atsr;
4230         struct dmar_atsr_unit *atsru;
4231
4232         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4233                 return 0;
4234
4235         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4236         atsru = dmar_find_atsr(atsr);
4237         if (atsru)
4238                 return 0;
4239
4240         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4241         if (!atsru)
4242                 return -ENOMEM;
4243
4244         /*
4245          * If memory is allocated from slab by ACPI _DSM method, we need to
4246          * copy the memory content because the memory buffer will be freed
4247          * on return.
4248          */
4249         atsru->hdr = (void *)(atsru + 1);
4250         memcpy(atsru->hdr, hdr, hdr->length);
4251         atsru->include_all = atsr->flags & 0x1;
4252         if (!atsru->include_all) {
4253                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4254                                 (void *)atsr + atsr->header.length,
4255                                 &atsru->devices_cnt);
4256                 if (atsru->devices_cnt && atsru->devices == NULL) {
4257                         kfree(atsru);
4258                         return -ENOMEM;
4259                 }
4260         }
4261
4262         list_add_rcu(&atsru->list, &dmar_atsr_units);
4263
4264         return 0;
4265 }
4266
4267 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4268 {
4269         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4270         kfree(atsru);
4271 }
4272
4273 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4274 {
4275         struct acpi_dmar_atsr *atsr;
4276         struct dmar_atsr_unit *atsru;
4277
4278         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4279         atsru = dmar_find_atsr(atsr);
4280         if (atsru) {
4281                 list_del_rcu(&atsru->list);
4282                 synchronize_rcu();
4283                 intel_iommu_free_atsr(atsru);
4284         }
4285
4286         return 0;
4287 }
4288
4289 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4290 {
4291         int i;
4292         struct device *dev;
4293         struct acpi_dmar_atsr *atsr;
4294         struct dmar_atsr_unit *atsru;
4295
4296         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4297         atsru = dmar_find_atsr(atsr);
4298         if (!atsru)
4299                 return 0;
4300
4301         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4302                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4303                                           i, dev)
4304                         return -EBUSY;
4305         }
4306
4307         return 0;
4308 }
4309
4310 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4311 {
4312         int sp, ret = 0;
4313         struct intel_iommu *iommu = dmaru->iommu;
4314
4315         if (g_iommus[iommu->seq_id])
4316                 return 0;
4317
4318         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4319                 pr_warn("%s: Doesn't support hardware pass through.\n",
4320                         iommu->name);
4321                 return -ENXIO;
4322         }
4323         if (!ecap_sc_support(iommu->ecap) &&
4324             domain_update_iommu_snooping(iommu)) {
4325                 pr_warn("%s: Doesn't support snooping.\n",
4326                         iommu->name);
4327                 return -ENXIO;
4328         }
4329         sp = domain_update_iommu_superpage(iommu) - 1;
4330         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4331                 pr_warn("%s: Doesn't support large page.\n",
4332                         iommu->name);
4333                 return -ENXIO;
4334         }
4335
4336         /*
4337          * Disable translation if already enabled prior to OS handover.
4338          */
4339         if (iommu->gcmd & DMA_GCMD_TE)
4340                 iommu_disable_translation(iommu);
4341
4342         g_iommus[iommu->seq_id] = iommu;
4343         ret = iommu_init_domains(iommu);
4344         if (ret == 0)
4345                 ret = iommu_alloc_root_entry(iommu);
4346         if (ret)
4347                 goto out;
4348
4349 #ifdef CONFIG_INTEL_IOMMU_SVM
4350         if (pasid_enabled(iommu))
4351                 intel_svm_alloc_pasid_tables(iommu);
4352 #endif
4353
4354         if (dmaru->ignored) {
4355                 /*
4356                  * we always have to disable PMRs or DMA may fail on this device
4357                  */
4358                 if (force_on)
4359                         iommu_disable_protect_mem_regions(iommu);
4360                 return 0;
4361         }
4362
4363         intel_iommu_init_qi(iommu);
4364         iommu_flush_write_buffer(iommu);
4365
4366 #ifdef CONFIG_INTEL_IOMMU_SVM
4367         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4368                 ret = intel_svm_enable_prq(iommu);
4369                 if (ret)
4370                         goto disable_iommu;
4371         }
4372 #endif
4373         ret = dmar_set_interrupt(iommu);
4374         if (ret)
4375                 goto disable_iommu;
4376
4377         iommu_set_root_entry(iommu);
4378         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4379         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4380         iommu_enable_translation(iommu);
4381
4382         iommu_disable_protect_mem_regions(iommu);
4383         return 0;
4384
4385 disable_iommu:
4386         disable_dmar_iommu(iommu);
4387 out:
4388         free_dmar_iommu(iommu);
4389         return ret;
4390 }
4391
4392 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4393 {
4394         int ret = 0;
4395         struct intel_iommu *iommu = dmaru->iommu;
4396
4397         if (!intel_iommu_enabled)
4398                 return 0;
4399         if (iommu == NULL)
4400                 return -EINVAL;
4401
4402         if (insert) {
4403                 ret = intel_iommu_add(dmaru);
4404         } else {
4405                 disable_dmar_iommu(iommu);
4406                 free_dmar_iommu(iommu);
4407         }
4408
4409         return ret;
4410 }
4411
4412 static void intel_iommu_free_dmars(void)
4413 {
4414         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4415         struct dmar_atsr_unit *atsru, *atsr_n;
4416
4417         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4418                 list_del(&rmrru->list);
4419                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4420                 kfree(rmrru);
4421         }
4422
4423         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4424                 list_del(&atsru->list);
4425                 intel_iommu_free_atsr(atsru);
4426         }
4427 }
4428
4429 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4430 {
4431         int i, ret = 1;
4432         struct pci_bus *bus;
4433         struct pci_dev *bridge = NULL;
4434         struct device *tmp;
4435         struct acpi_dmar_atsr *atsr;
4436         struct dmar_atsr_unit *atsru;
4437
4438         dev = pci_physfn(dev);
4439         for (bus = dev->bus; bus; bus = bus->parent) {
4440                 bridge = bus->self;
4441                 /* If it's an integrated device, allow ATS */
4442                 if (!bridge)
4443                         return 1;
4444                 /* Connected via non-PCIe: no ATS */
4445                 if (!pci_is_pcie(bridge) ||
4446                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4447                         return 0;
4448                 /* If we found the root port, look it up in the ATSR */
4449                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4450                         break;
4451         }
4452
4453         rcu_read_lock();
4454         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4455                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4456                 if (atsr->segment != pci_domain_nr(dev->bus))
4457                         continue;
4458
4459                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4460                         if (tmp == &bridge->dev)
4461                                 goto out;
4462
4463                 if (atsru->include_all)
4464                         goto out;
4465         }
4466         ret = 0;
4467 out:
4468         rcu_read_unlock();
4469
4470         return ret;
4471 }
4472
4473 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4474 {
4475         int ret = 0;
4476         struct dmar_rmrr_unit *rmrru;
4477         struct dmar_atsr_unit *atsru;
4478         struct acpi_dmar_atsr *atsr;
4479         struct acpi_dmar_reserved_memory *rmrr;
4480
4481         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4482                 return 0;
4483
4484         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4485                 rmrr = container_of(rmrru->hdr,
4486                                     struct acpi_dmar_reserved_memory, header);
4487                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4488                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4489                                 ((void *)rmrr) + rmrr->header.length,
4490                                 rmrr->segment, rmrru->devices,
4491                                 rmrru->devices_cnt);
4492                         if(ret < 0)
4493                                 return ret;
4494                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4495                         dmar_remove_dev_scope(info, rmrr->segment,
4496                                 rmrru->devices, rmrru->devices_cnt);
4497                 }
4498         }
4499
4500         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4501                 if (atsru->include_all)
4502                         continue;
4503
4504                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4505                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4506                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4507                                         (void *)atsr + atsr->header.length,
4508                                         atsr->segment, atsru->devices,
4509                                         atsru->devices_cnt);
4510                         if (ret > 0)
4511                                 break;
4512                         else if(ret < 0)
4513                                 return ret;
4514                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4515                         if (dmar_remove_dev_scope(info, atsr->segment,
4516                                         atsru->devices, atsru->devices_cnt))
4517                                 break;
4518                 }
4519         }
4520
4521         return 0;
4522 }
4523
4524 /*
4525  * Here we only respond to action of unbound device from driver.
4526  *
4527  * Added device is not attached to its DMAR domain here yet. That will happen
4528  * when mapping the device to iova.
4529  */
4530 static int device_notifier(struct notifier_block *nb,
4531                                   unsigned long action, void *data)
4532 {
4533         struct device *dev = data;
4534         struct dmar_domain *domain;
4535
4536         if (iommu_dummy(dev))
4537                 return 0;
4538
4539         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4540                 return 0;
4541
4542         domain = find_domain(dev);
4543         if (!domain)
4544                 return 0;
4545
4546         dmar_remove_one_dev_info(domain, dev);
4547         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4548                 domain_exit(domain);
4549
4550         return 0;
4551 }
4552
4553 static struct notifier_block device_nb = {
4554         .notifier_call = device_notifier,
4555 };
4556
4557 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4558                                        unsigned long val, void *v)
4559 {
4560         struct memory_notify *mhp = v;
4561         unsigned long long start, end;
4562         unsigned long start_vpfn, last_vpfn;
4563
4564         switch (val) {
4565         case MEM_GOING_ONLINE:
4566                 start = mhp->start_pfn << PAGE_SHIFT;
4567                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4568                 if (iommu_domain_identity_map(si_domain, start, end)) {
4569                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4570                                 start, end);
4571                         return NOTIFY_BAD;
4572                 }
4573                 break;
4574
4575         case MEM_OFFLINE:
4576         case MEM_CANCEL_ONLINE:
4577                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4578                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4579                 while (start_vpfn <= last_vpfn) {
4580                         struct iova *iova;
4581                         struct dmar_drhd_unit *drhd;
4582                         struct intel_iommu *iommu;
4583                         struct page *freelist;
4584
4585                         iova = find_iova(&si_domain->iovad, start_vpfn);
4586                         if (iova == NULL) {
4587                                 pr_debug("Failed get IOVA for PFN %lx\n",
4588                                          start_vpfn);
4589                                 break;
4590                         }
4591
4592                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4593                                                      start_vpfn, last_vpfn);
4594                         if (iova == NULL) {
4595                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4596                                         start_vpfn, last_vpfn);
4597                                 return NOTIFY_BAD;
4598                         }
4599
4600                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4601                                                iova->pfn_hi);
4602
4603                         rcu_read_lock();
4604                         for_each_active_iommu(iommu, drhd)
4605                                 iommu_flush_iotlb_psi(iommu, si_domain,
4606                                         iova->pfn_lo, iova_size(iova),
4607                                         !freelist, 0);
4608                         rcu_read_unlock();
4609                         dma_free_pagelist(freelist);
4610
4611                         start_vpfn = iova->pfn_hi + 1;
4612                         free_iova_mem(iova);
4613                 }
4614                 break;
4615         }
4616
4617         return NOTIFY_OK;
4618 }
4619
4620 static struct notifier_block intel_iommu_memory_nb = {
4621         .notifier_call = intel_iommu_memory_notifier,
4622         .priority = 0
4623 };
4624
4625 static void free_all_cpu_cached_iovas(unsigned int cpu)
4626 {
4627         int i;
4628
4629         for (i = 0; i < g_num_of_iommus; i++) {
4630                 struct intel_iommu *iommu = g_iommus[i];
4631                 struct dmar_domain *domain;
4632                 int did;
4633
4634                 if (!iommu)
4635                         continue;
4636
4637                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4638                         domain = get_iommu_domain(iommu, (u16)did);
4639
4640                         if (!domain)
4641                                 continue;
4642                         free_cpu_cached_iovas(cpu, &domain->iovad);
4643                 }
4644         }
4645 }
4646
4647 static int intel_iommu_cpu_dead(unsigned int cpu)
4648 {
4649         free_all_cpu_cached_iovas(cpu);
4650         return 0;
4651 }
4652
4653 static void intel_disable_iommus(void)
4654 {
4655         struct intel_iommu *iommu = NULL;
4656         struct dmar_drhd_unit *drhd;
4657
4658         for_each_iommu(iommu, drhd)
4659                 iommu_disable_translation(iommu);
4660 }
4661
4662 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4663 {
4664         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4665
4666         return container_of(iommu_dev, struct intel_iommu, iommu);
4667 }
4668
4669 static ssize_t intel_iommu_show_version(struct device *dev,
4670                                         struct device_attribute *attr,
4671                                         char *buf)
4672 {
4673         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4674         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4675         return sprintf(buf, "%d:%d\n",
4676                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4677 }
4678 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4679
4680 static ssize_t intel_iommu_show_address(struct device *dev,
4681                                         struct device_attribute *attr,
4682                                         char *buf)
4683 {
4684         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4685         return sprintf(buf, "%llx\n", iommu->reg_phys);
4686 }
4687 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4688
4689 static ssize_t intel_iommu_show_cap(struct device *dev,
4690                                     struct device_attribute *attr,
4691                                     char *buf)
4692 {
4693         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4694         return sprintf(buf, "%llx\n", iommu->cap);
4695 }
4696 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4697
4698 static ssize_t intel_iommu_show_ecap(struct device *dev,
4699                                     struct device_attribute *attr,
4700                                     char *buf)
4701 {
4702         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4703         return sprintf(buf, "%llx\n", iommu->ecap);
4704 }
4705 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4706
4707 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4708                                       struct device_attribute *attr,
4709                                       char *buf)
4710 {
4711         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4712         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4713 }
4714 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4715
4716 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4717                                            struct device_attribute *attr,
4718                                            char *buf)
4719 {
4720         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4721         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4722                                                   cap_ndoms(iommu->cap)));
4723 }
4724 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4725
4726 static struct attribute *intel_iommu_attrs[] = {
4727         &dev_attr_version.attr,
4728         &dev_attr_address.attr,
4729         &dev_attr_cap.attr,
4730         &dev_attr_ecap.attr,
4731         &dev_attr_domains_supported.attr,
4732         &dev_attr_domains_used.attr,
4733         NULL,
4734 };
4735
4736 static struct attribute_group intel_iommu_group = {
4737         .name = "intel-iommu",
4738         .attrs = intel_iommu_attrs,
4739 };
4740
4741 const struct attribute_group *intel_iommu_groups[] = {
4742         &intel_iommu_group,
4743         NULL,
4744 };
4745
4746 int __init intel_iommu_init(void)
4747 {
4748         int ret = -ENODEV;
4749         struct dmar_drhd_unit *drhd;
4750         struct intel_iommu *iommu;
4751
4752         /* VT-d is required for a TXT/tboot launch, so enforce that */
4753         force_on = tboot_force_iommu();
4754
4755         if (iommu_init_mempool()) {
4756                 if (force_on)
4757                         panic("tboot: Failed to initialize iommu memory\n");
4758                 return -ENOMEM;
4759         }
4760
4761         down_write(&dmar_global_lock);
4762         if (dmar_table_init()) {
4763                 if (force_on)
4764                         panic("tboot: Failed to initialize DMAR table\n");
4765                 goto out_free_dmar;
4766         }
4767
4768         if (dmar_dev_scope_init() < 0) {
4769                 if (force_on)
4770                         panic("tboot: Failed to initialize DMAR device scope\n");
4771                 goto out_free_dmar;
4772         }
4773
4774         if (no_iommu || dmar_disabled) {
4775                 /*
4776                  * We exit the function here to ensure IOMMU's remapping and
4777                  * mempool aren't setup, which means that the IOMMU's PMRs
4778                  * won't be disabled via the call to init_dmars(). So disable
4779                  * it explicitly here. The PMRs were setup by tboot prior to
4780                  * calling SENTER, but the kernel is expected to reset/tear
4781                  * down the PMRs.
4782                  */
4783                 if (intel_iommu_tboot_noforce) {
4784                         for_each_iommu(iommu, drhd)
4785                                 iommu_disable_protect_mem_regions(iommu);
4786                 }
4787
4788                 /*
4789                  * Make sure the IOMMUs are switched off, even when we
4790                  * boot into a kexec kernel and the previous kernel left
4791                  * them enabled
4792                  */
4793                 intel_disable_iommus();
4794                 goto out_free_dmar;
4795         }
4796
4797         if (list_empty(&dmar_rmrr_units))
4798                 pr_info("No RMRR found\n");
4799
4800         if (list_empty(&dmar_atsr_units))
4801                 pr_info("No ATSR found\n");
4802
4803         if (dmar_init_reserved_ranges()) {
4804                 if (force_on)
4805                         panic("tboot: Failed to reserve iommu ranges\n");
4806                 goto out_free_reserved_range;
4807         }
4808
4809         if (dmar_map_gfx)
4810                 intel_iommu_gfx_mapped = 1;
4811
4812         init_no_remapping_devices();
4813
4814         ret = init_dmars();
4815         if (ret) {
4816                 if (force_on)
4817                         panic("tboot: Failed to initialize DMARs\n");
4818                 pr_err("Initialization failed\n");
4819                 goto out_free_reserved_range;
4820         }
4821         up_write(&dmar_global_lock);
4822         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4823
4824 #ifdef CONFIG_SWIOTLB
4825         swiotlb = 0;
4826 #endif
4827         dma_ops = &intel_dma_ops;
4828
4829         init_iommu_pm_ops();
4830
4831         for_each_active_iommu(iommu, drhd) {
4832                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4833                                        intel_iommu_groups,
4834                                        "%s", iommu->name);
4835                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4836                 iommu_device_register(&iommu->iommu);
4837         }
4838
4839         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4840         bus_register_notifier(&pci_bus_type, &device_nb);
4841         if (si_domain && !hw_pass_through)
4842                 register_memory_notifier(&intel_iommu_memory_nb);
4843         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4844                           intel_iommu_cpu_dead);
4845         intel_iommu_enabled = 1;
4846
4847         return 0;
4848
4849 out_free_reserved_range:
4850         put_iova_domain(&reserved_iova_list);
4851 out_free_dmar:
4852         intel_iommu_free_dmars();
4853         up_write(&dmar_global_lock);
4854         iommu_exit_mempool();
4855         return ret;
4856 }
4857
4858 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4859 {
4860         struct intel_iommu *iommu = opaque;
4861
4862         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4863         return 0;
4864 }
4865
4866 /*
4867  * NB - intel-iommu lacks any sort of reference counting for the users of
4868  * dependent devices.  If multiple endpoints have intersecting dependent
4869  * devices, unbinding the driver from any one of them will possibly leave
4870  * the others unable to operate.
4871  */
4872 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4873 {
4874         if (!iommu || !dev || !dev_is_pci(dev))
4875                 return;
4876
4877         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4878 }
4879
4880 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4881 {
4882         struct intel_iommu *iommu;
4883         unsigned long flags;
4884
4885         assert_spin_locked(&device_domain_lock);
4886
4887         if (WARN_ON(!info))
4888                 return;
4889
4890         iommu = info->iommu;
4891
4892         if (info->dev) {
4893                 iommu_disable_dev_iotlb(info);
4894                 domain_context_clear(iommu, info->dev);
4895         }
4896
4897         unlink_domain_info(info);
4898
4899         spin_lock_irqsave(&iommu->lock, flags);
4900         domain_detach_iommu(info->domain, iommu);
4901         spin_unlock_irqrestore(&iommu->lock, flags);
4902
4903         free_devinfo_mem(info);
4904 }
4905
4906 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4907                                      struct device *dev)
4908 {
4909         struct device_domain_info *info;
4910         unsigned long flags;
4911
4912         spin_lock_irqsave(&device_domain_lock, flags);
4913         info = dev->archdata.iommu;
4914         __dmar_remove_one_dev_info(info);
4915         spin_unlock_irqrestore(&device_domain_lock, flags);
4916 }
4917
4918 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4919 {
4920         int adjust_width;
4921
4922         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4923                         DMA_32BIT_PFN);
4924         domain_reserve_special_ranges(domain);
4925
4926         /* calculate AGAW */
4927         domain->gaw = guest_width;
4928         adjust_width = guestwidth_to_adjustwidth(guest_width);
4929         domain->agaw = width_to_agaw(adjust_width);
4930
4931         domain->iommu_coherency = 0;
4932         domain->iommu_snooping = 0;
4933         domain->iommu_superpage = 0;
4934         domain->max_addr = 0;
4935
4936         /* always allocate the top pgd */
4937         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4938         if (!domain->pgd)
4939                 return -ENOMEM;
4940         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4941         return 0;
4942 }
4943
4944 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4945 {
4946         struct dmar_domain *dmar_domain;
4947         struct iommu_domain *domain;
4948
4949         if (type != IOMMU_DOMAIN_UNMANAGED)
4950                 return NULL;
4951
4952         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4953         if (!dmar_domain) {
4954                 pr_err("Can't allocate dmar_domain\n");
4955                 return NULL;
4956         }
4957         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4958                 pr_err("Domain initialization failed\n");
4959                 domain_exit(dmar_domain);
4960                 return NULL;
4961         }
4962         domain_update_iommu_cap(dmar_domain);
4963
4964         domain = &dmar_domain->domain;
4965         domain->geometry.aperture_start = 0;
4966         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4967         domain->geometry.force_aperture = true;
4968
4969         return domain;
4970 }
4971
4972 static void intel_iommu_domain_free(struct iommu_domain *domain)
4973 {
4974         domain_exit(to_dmar_domain(domain));
4975 }
4976
4977 static int intel_iommu_attach_device(struct iommu_domain *domain,
4978                                      struct device *dev)
4979 {
4980         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4981         struct intel_iommu *iommu;
4982         int addr_width;
4983         u8 bus, devfn;
4984
4985         if (device_is_rmrr_locked(dev)) {
4986                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4987                 return -EPERM;
4988         }
4989
4990         /* normally dev is not mapped */
4991         if (unlikely(domain_context_mapped(dev))) {
4992                 struct dmar_domain *old_domain;
4993
4994                 old_domain = find_domain(dev);
4995                 if (old_domain) {
4996                         rcu_read_lock();
4997                         dmar_remove_one_dev_info(old_domain, dev);
4998                         rcu_read_unlock();
4999
5000                         if (!domain_type_is_vm_or_si(old_domain) &&
5001                              list_empty(&old_domain->devices))
5002                                 domain_exit(old_domain);
5003                 }
5004         }
5005
5006         iommu = device_to_iommu(dev, &bus, &devfn);
5007         if (!iommu)
5008                 return -ENODEV;
5009
5010         /* check if this iommu agaw is sufficient for max mapped address */
5011         addr_width = agaw_to_width(iommu->agaw);
5012         if (addr_width > cap_mgaw(iommu->cap))
5013                 addr_width = cap_mgaw(iommu->cap);
5014
5015         if (dmar_domain->max_addr > (1LL << addr_width)) {
5016                 pr_err("%s: iommu width (%d) is not "
5017                        "sufficient for the mapped address (%llx)\n",
5018                        __func__, addr_width, dmar_domain->max_addr);
5019                 return -EFAULT;
5020         }
5021         dmar_domain->gaw = addr_width;
5022
5023         /*
5024          * Knock out extra levels of page tables if necessary
5025          */
5026         while (iommu->agaw < dmar_domain->agaw) {
5027                 struct dma_pte *pte;
5028
5029                 pte = dmar_domain->pgd;
5030                 if (dma_pte_present(pte)) {
5031                         dmar_domain->pgd = (struct dma_pte *)
5032                                 phys_to_virt(dma_pte_addr(pte));
5033                         free_pgtable_page(pte);
5034                 }
5035                 dmar_domain->agaw--;
5036         }
5037
5038         return domain_add_dev_info(dmar_domain, dev);
5039 }
5040
5041 static void intel_iommu_detach_device(struct iommu_domain *domain,
5042                                       struct device *dev)
5043 {
5044         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5045 }
5046
5047 static int intel_iommu_map(struct iommu_domain *domain,
5048                            unsigned long iova, phys_addr_t hpa,
5049                            size_t size, int iommu_prot)
5050 {
5051         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5052         u64 max_addr;
5053         int prot = 0;
5054         int ret;
5055
5056         if (iommu_prot & IOMMU_READ)
5057                 prot |= DMA_PTE_READ;
5058         if (iommu_prot & IOMMU_WRITE)
5059                 prot |= DMA_PTE_WRITE;
5060         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5061                 prot |= DMA_PTE_SNP;
5062
5063         max_addr = iova + size;
5064         if (dmar_domain->max_addr < max_addr) {
5065                 u64 end;
5066
5067                 /* check if minimum agaw is sufficient for mapped address */
5068                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5069                 if (end < max_addr) {
5070                         pr_err("%s: iommu width (%d) is not "
5071                                "sufficient for the mapped address (%llx)\n",
5072                                __func__, dmar_domain->gaw, max_addr);
5073                         return -EFAULT;
5074                 }
5075                 dmar_domain->max_addr = max_addr;
5076         }
5077         /* Round up size to next multiple of PAGE_SIZE, if it and
5078            the low bits of hpa would take us onto the next page */
5079         size = aligned_nrpages(hpa, size);
5080         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5081                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5082         return ret;
5083 }
5084
5085 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5086                                 unsigned long iova, size_t size)
5087 {
5088         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5089         struct page *freelist = NULL;
5090         struct intel_iommu *iommu;
5091         unsigned long start_pfn, last_pfn;
5092         unsigned int npages;
5093         int iommu_id, level = 0;
5094
5095         /* Cope with horrid API which requires us to unmap more than the
5096            size argument if it happens to be a large-page mapping. */
5097         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5098
5099         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5100                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5101
5102         start_pfn = iova >> VTD_PAGE_SHIFT;
5103         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5104
5105         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5106
5107         npages = last_pfn - start_pfn + 1;
5108
5109         for_each_domain_iommu(iommu_id, dmar_domain) {
5110                 iommu = g_iommus[iommu_id];
5111
5112                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5113                                       start_pfn, npages, !freelist, 0);
5114         }
5115
5116         dma_free_pagelist(freelist);
5117
5118         if (dmar_domain->max_addr == iova + size)
5119                 dmar_domain->max_addr = iova;
5120
5121         return size;
5122 }
5123
5124 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5125                                             dma_addr_t iova)
5126 {
5127         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5128         struct dma_pte *pte;
5129         int level = 0;
5130         u64 phys = 0;
5131
5132         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5133         if (pte && dma_pte_present(pte))
5134                 phys = dma_pte_addr(pte) +
5135                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5136                                                 VTD_PAGE_SHIFT) - 1));
5137
5138         return phys;
5139 }
5140
5141 static bool intel_iommu_capable(enum iommu_cap cap)
5142 {
5143         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5144                 return domain_update_iommu_snooping(NULL) == 1;
5145         if (cap == IOMMU_CAP_INTR_REMAP)
5146                 return irq_remapping_enabled == 1;
5147
5148         return false;
5149 }
5150
5151 static int intel_iommu_add_device(struct device *dev)
5152 {
5153         struct intel_iommu *iommu;
5154         struct iommu_group *group;
5155         u8 bus, devfn;
5156
5157         iommu = device_to_iommu(dev, &bus, &devfn);
5158         if (!iommu)
5159                 return -ENODEV;
5160
5161         iommu_device_link(&iommu->iommu, dev);
5162
5163         group = iommu_group_get_for_dev(dev);
5164
5165         if (IS_ERR(group))
5166                 return PTR_ERR(group);
5167
5168         iommu_group_put(group);
5169         return 0;
5170 }
5171
5172 static void intel_iommu_remove_device(struct device *dev)
5173 {
5174         struct intel_iommu *iommu;
5175         u8 bus, devfn;
5176
5177         iommu = device_to_iommu(dev, &bus, &devfn);
5178         if (!iommu)
5179                 return;
5180
5181         iommu_group_remove_device(dev);
5182
5183         iommu_device_unlink(&iommu->iommu, dev);
5184 }
5185
5186 static void intel_iommu_get_resv_regions(struct device *device,
5187                                          struct list_head *head)
5188 {
5189         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5190         struct iommu_resv_region *reg;
5191         struct dmar_rmrr_unit *rmrr;
5192         struct device *i_dev;
5193         int i;
5194
5195         down_read(&dmar_global_lock);
5196         for_each_rmrr_units(rmrr) {
5197                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5198                                           i, i_dev) {
5199                         struct iommu_resv_region *resv;
5200                         size_t length;
5201
5202                         if (i_dev != device)
5203                                 continue;
5204
5205                         length = rmrr->end_address - rmrr->base_address + 1;
5206                         resv = iommu_alloc_resv_region(rmrr->base_address,
5207                                                        length, prot,
5208                                                        IOMMU_RESV_DIRECT);
5209                         if (!resv)
5210                                 break;
5211
5212                         list_add_tail(&resv->list, head);
5213                 }
5214         }
5215         up_read(&dmar_global_lock);
5216
5217         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5218                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5219                                       0, IOMMU_RESV_MSI);
5220         if (!reg)
5221                 return;
5222         list_add_tail(&reg->list, head);
5223 }
5224
5225 static void intel_iommu_put_resv_regions(struct device *dev,
5226                                          struct list_head *head)
5227 {
5228         struct iommu_resv_region *entry, *next;
5229
5230         list_for_each_entry_safe(entry, next, head, list)
5231                 kfree(entry);
5232 }
5233
5234 #ifdef CONFIG_INTEL_IOMMU_SVM
5235 #define MAX_NR_PASID_BITS (20)
5236 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5237 {
5238         /*
5239          * Convert ecap_pss to extend context entry pts encoding, also
5240          * respect the soft pasid_max value set by the iommu.
5241          * - number of PASID bits = ecap_pss + 1
5242          * - number of PASID table entries = 2^(pts + 5)
5243          * Therefore, pts = ecap_pss - 4
5244          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5245          */
5246         if (ecap_pss(iommu->ecap) < 5)
5247                 return 0;
5248
5249         /* pasid_max is encoded as actual number of entries not the bits */
5250         return find_first_bit((unsigned long *)&iommu->pasid_max,
5251                         MAX_NR_PASID_BITS) - 5;
5252 }
5253
5254 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5255 {
5256         struct device_domain_info *info;
5257         struct context_entry *context;
5258         struct dmar_domain *domain;
5259         unsigned long flags;
5260         u64 ctx_lo;
5261         int ret;
5262
5263         domain = get_valid_domain_for_dev(sdev->dev);
5264         if (!domain)
5265                 return -EINVAL;
5266
5267         spin_lock_irqsave(&device_domain_lock, flags);
5268         spin_lock(&iommu->lock);
5269
5270         ret = -EINVAL;
5271         info = sdev->dev->archdata.iommu;
5272         if (!info || !info->pasid_supported)
5273                 goto out;
5274
5275         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5276         if (WARN_ON(!context))
5277                 goto out;
5278
5279         ctx_lo = context[0].lo;
5280
5281         sdev->did = domain->iommu_did[iommu->seq_id];
5282         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5283
5284         if (!(ctx_lo & CONTEXT_PASIDE)) {
5285                 if (iommu->pasid_state_table)
5286                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5287                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5288                         intel_iommu_get_pts(iommu);
5289
5290                 wmb();
5291                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5292                  * extended to permit requests-with-PASID if the PASIDE bit
5293                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5294                  * however, the PASIDE bit is ignored and requests-with-PASID
5295                  * are unconditionally blocked. Which makes less sense.
5296                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5297                  * "guest mode" translation types depending on whether ATS
5298                  * is available or not. Annoyingly, we can't use the new
5299                  * modes *unless* PASIDE is set. */
5300                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5301                         ctx_lo &= ~CONTEXT_TT_MASK;
5302                         if (info->ats_supported)
5303                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5304                         else
5305                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5306                 }
5307                 ctx_lo |= CONTEXT_PASIDE;
5308                 if (iommu->pasid_state_table)
5309                         ctx_lo |= CONTEXT_DINVE;
5310                 if (info->pri_supported)
5311                         ctx_lo |= CONTEXT_PRS;
5312                 context[0].lo = ctx_lo;
5313                 wmb();
5314                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5315                                            DMA_CCMD_MASK_NOBIT,
5316                                            DMA_CCMD_DEVICE_INVL);
5317         }
5318
5319         /* Enable PASID support in the device, if it wasn't already */
5320         if (!info->pasid_enabled)
5321                 iommu_enable_dev_iotlb(info);
5322
5323         if (info->ats_enabled) {
5324                 sdev->dev_iotlb = 1;
5325                 sdev->qdep = info->ats_qdep;
5326                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5327                         sdev->qdep = 0;
5328         }
5329         ret = 0;
5330
5331  out:
5332         spin_unlock(&iommu->lock);
5333         spin_unlock_irqrestore(&device_domain_lock, flags);
5334
5335         return ret;
5336 }
5337
5338 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5339 {
5340         struct intel_iommu *iommu;
5341         u8 bus, devfn;
5342
5343         if (iommu_dummy(dev)) {
5344                 dev_warn(dev,
5345                          "No IOMMU translation for device; cannot enable SVM\n");
5346                 return NULL;
5347         }
5348
5349         iommu = device_to_iommu(dev, &bus, &devfn);
5350         if ((!iommu)) {
5351                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5352                 return NULL;
5353         }
5354
5355         if (!iommu->pasid_table) {
5356                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5357                 return NULL;
5358         }
5359
5360         return iommu;
5361 }
5362 #endif /* CONFIG_INTEL_IOMMU_SVM */
5363
5364 const struct iommu_ops intel_iommu_ops = {
5365         .capable                = intel_iommu_capable,
5366         .domain_alloc           = intel_iommu_domain_alloc,
5367         .domain_free            = intel_iommu_domain_free,
5368         .attach_dev             = intel_iommu_attach_device,
5369         .detach_dev             = intel_iommu_detach_device,
5370         .map                    = intel_iommu_map,
5371         .unmap                  = intel_iommu_unmap,
5372         .map_sg                 = default_iommu_map_sg,
5373         .iova_to_phys           = intel_iommu_iova_to_phys,
5374         .add_device             = intel_iommu_add_device,
5375         .remove_device          = intel_iommu_remove_device,
5376         .get_resv_regions       = intel_iommu_get_resv_regions,
5377         .put_resv_regions       = intel_iommu_put_resv_regions,
5378         .device_group           = pci_device_group,
5379         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5380 };
5381
5382 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5383 {
5384         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5385         pr_info("Disabling IOMMU for graphics on this chipset\n");
5386         dmar_map_gfx = 0;
5387 }
5388
5389 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5390 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5391 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5392 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5393 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5396
5397 static void quirk_iommu_rwbf(struct pci_dev *dev)
5398 {
5399         /*
5400          * Mobile 4 Series Chipset neglects to set RWBF capability,
5401          * but needs it. Same seems to hold for the desktop versions.
5402          */
5403         pr_info("Forcing write-buffer flush capability\n");
5404         rwbf_quirk = 1;
5405 }
5406
5407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5414
5415 #define GGC 0x52
5416 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5417 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5418 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5419 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5420 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5421 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5422 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5423 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5424
5425 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5426 {
5427         unsigned short ggc;
5428
5429         if (pci_read_config_word(dev, GGC, &ggc))
5430                 return;
5431
5432         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5433                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5434                 dmar_map_gfx = 0;
5435         } else if (dmar_map_gfx) {
5436                 /* we have to ensure the gfx device is idle before we flush */
5437                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5438                 intel_iommu_strict = 1;
5439        }
5440 }
5441 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5442 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5445
5446 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5447    ISOCH DMAR unit for the Azalia sound device, but not give it any
5448    TLB entries, which causes it to deadlock. Check for that.  We do
5449    this in a function called from init_dmars(), instead of in a PCI
5450    quirk, because we don't want to print the obnoxious "BIOS broken"
5451    message if VT-d is actually disabled.
5452 */
5453 static void __init check_tylersburg_isoch(void)
5454 {
5455         struct pci_dev *pdev;
5456         uint32_t vtisochctrl;
5457
5458         /* If there's no Azalia in the system anyway, forget it. */
5459         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5460         if (!pdev)
5461                 return;
5462         pci_dev_put(pdev);
5463
5464         /* System Management Registers. Might be hidden, in which case
5465            we can't do the sanity check. But that's OK, because the
5466            known-broken BIOSes _don't_ actually hide it, so far. */
5467         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5468         if (!pdev)
5469                 return;
5470
5471         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5472                 pci_dev_put(pdev);
5473                 return;
5474         }
5475
5476         pci_dev_put(pdev);
5477
5478         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5479         if (vtisochctrl & 1)
5480                 return;
5481
5482         /* Drop all bits other than the number of TLB entries */
5483         vtisochctrl &= 0x1c;
5484
5485         /* If we have the recommended number of TLB entries (16), fine. */
5486         if (vtisochctrl == 0x10)
5487                 return;
5488
5489         /* Zero TLB entries? You get to ride the short bus to school. */
5490         if (!vtisochctrl) {
5491                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5492                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5493                      dmi_get_system_info(DMI_BIOS_VENDOR),
5494                      dmi_get_system_info(DMI_BIOS_VERSION),
5495                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5496                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5497                 return;
5498         }
5499
5500         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5501                vtisochctrl);
5502 }