GNU Linux-libre 4.4.288-gnu1
drivers/vfio/vfio_iommu_spapr_tce.c
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group);

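/*
 * Account @npages of locked memory against the current process's
 * RLIMIT_MEMLOCK; fails with -ENOMEM when the limit would be exceeded
 * and the caller lacks CAP_IPC_LOCK.
 */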
static long try_increment_locked_vm(long npages)
{
        long ret = 0, locked, lock_limit;

        if (!current || !current->mm)
                return -ESRCH; /* process exited */

        if (!npages)
                return 0;

        down_write(&current->mm->mmap_sem);
        locked = current->mm->locked_vm + npages;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;
        else
                current->mm->locked_vm += npages;

        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK),
                        ret ? " - exceeded" : "");

        up_write(&current->mm->mmap_sem);

        return ret;
}

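/* Undo the accounting done by try_increment_locked_vm(). */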
static void decrement_locked_vm(long npages)
{
        if (!current || !current->mm || !npages)
                return; /* process exited */

        down_write(&current->mm->mmap_sem);
        if (WARN_ON_ONCE(npages > current->mm->locked_vm))
                npages = current->mm->locked_vm;
        current->mm->locked_vm -= npages;
        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
                        npages << PAGE_SHIFT,
                        current->mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK));
        up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

struct tce_iommu_group {
        struct list_head next;
        struct iommu_group *grp;
};

/*
 * The container descriptor. The VFIO API does not supply an IOMMU group
 * at the moment of initialization, so groups are attached later and
 * kept in group_list.
 */
struct tce_container {
        struct mutex lock;
        bool enabled;
        bool v2;
        unsigned long locked_pages;
        struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
        struct list_head group_list;
};

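/*
 * Drop a reference to a memory region previously preregistered with
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY; both @vaddr and @size must be
 * page aligned.
 */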
static long tce_iommu_unregister_pages(struct tce_container *container,
                __u64 vaddr, __u64 size)
{
        struct mm_iommu_table_group_mem_t *mem;

        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
                return -EINVAL;

        mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
        if (!mem)
                return -ENOENT;

        return mm_iommu_put(mem);
}

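/*
 * Preregister a userspace memory region for the v2 interface
 * (VFIO_IOMMU_SPAPR_REGISTER_MEMORY); the region is looked up later
 * by tce_iommu_prereg_ua_to_hpa() when TCEs are built.
 */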
static long tce_iommu_register_pages(struct tce_container *container,
                __u64 vaddr, __u64 size)
{
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem = NULL;
        unsigned long entries = size >> PAGE_SHIFT;

        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
                        ((vaddr + size) < vaddr))
                return -EINVAL;

        ret = mm_iommu_get(vaddr, entries, &mem);
        if (ret)
                return ret;

        container->enabled = true;

        return 0;
}

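/*
 * Allocate (and account as locked memory) the per-table array which
 * caches the userspace address mapped at each TCE entry; the v2
 * interface needs it to release preregistered memory on unmap.
 */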
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);
        unsigned long *uas;
        long ret;

        BUG_ON(tbl->it_userspace);

        ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
        if (ret)
                return ret;

        uas = vzalloc(cb);
        if (!uas) {
                decrement_locked_vm(cb >> PAGE_SHIFT);
                return -ENOMEM;
        }
        tbl->it_userspace = uas;

        return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
{
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);

        if (!tbl->it_userspace)
                return;

        vfree(tbl->it_userspace);
        tbl->it_userspace = NULL;
        decrement_locked_vm(cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
        /*
         * Check that the TCE table granularity is not bigger than the size of
         * a page we just found. Otherwise the hardware can get access to
         * a bigger memory chunk than it should.
         */
        return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
        return !list_empty(&container->group_list);
}

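/*
 * Find the container's DMA window (if any) covering the given I/O
 * address; returns the window number or -1 if no window matches.
 */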
static long tce_iommu_find_table(struct tce_container *container,
                phys_addr_t ioba, struct iommu_table **ptbl)
{
        long i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (tbl) {
                        unsigned long entry = ioba >> tbl->it_page_shift;
                        unsigned long start = tbl->it_offset;
                        unsigned long end = start + tbl->it_size;

                        if ((start <= entry) && (entry < end)) {
                                *ptbl = tbl;
                                return i;
                        }
                }
        }

        return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
        int i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                if (!container->tables[i])
                        return i;
        }

        return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
        int ret = 0;
        unsigned long locked;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;

        if (!current->mm)
                return -ESRCH; /* process exited */

        if (container->enabled)
                return -EBUSY;

        /*
         * When userspace pages are mapped into the IOMMU, they are effectively
         * locked memory, so, theoretically, we need to update the accounting
         * of locked pages on each map and unmap.  For powerpc, the map/unmap
         * paths can be very hot, though, and the accounting would kill
         * performance, especially since it would be difficult, if not
         * impossible, to handle the accounting in real mode only.
         *
         * To address that, rather than precisely accounting every page, we
         * instead account for a worst case on locked memory when the iommu is
         * enabled and disabled.  The worst case upper bound on locked memory
         * is the size of the whole iommu window, which is usually relatively
         * small (compared to total memory sizes) on POWER hardware.
         *
         * Also, we don't have a nice way to fail an H_PUT_TCE due to ulimits;
         * that would effectively kill the guest at random points, so it is
         * much better to enforce the limit based on the maximum that the
         * guest can map.
         *
         * Unfortunately, at the moment this counts whole tables, no matter how
         * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
         * each with a 2GB DMA window, 8GB will be counted here. The reason for
         * this is that we cannot tell here how much RAM the guest uses, as
         * this information is only available from KVM and VFIO is
         * KVM agnostic.
         *
         * So we do not allow enabling a container without a group attached,
         * as there is no way to know how much we should increment
         * the locked_vm counter.
         */
        if (!tce_groups_attached(container))
                return -ENODEV;

        tcegrp = list_first_entry(&container->group_list,
                        struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        if (!table_group)
                return -ENODEV;

        if (!table_group->tce32_size)
                return -EPERM;

        locked = table_group->tce32_size >> PAGE_SHIFT;
        ret = try_increment_locked_vm(locked);
        if (ret)
                return ret;

        container->locked_pages = locked;

        container->enabled = true;

        return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
        if (!container->enabled)
                return;

        container->enabled = false;

        if (!current->mm)
                return;

        decrement_locked_vm(container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
        struct tce_container *container;

        if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
                pr_err("tce_vfio: Wrong IOMMU type\n");
                return ERR_PTR(-EINVAL);
        }

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return ERR_PTR(-ENOMEM);

        mutex_init(&container->lock);
        INIT_LIST_HEAD_RCU(&container->group_list);

        container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

        return container;
}

static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct iommu_table *tbl);

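/*
 * Container teardown: detach all remaining groups, clear and free any
 * tables the container still owns, and drop the locked-memory
 * accounting taken at enable time.
 */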
static void tce_iommu_release(void *iommu_data)
{
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;
        long i;

        while (tce_groups_attached(container)) {
                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
                table_group = iommu_group_get_iommudata(tcegrp->grp);
                tce_iommu_detach_group(iommu_data, tcegrp->grp);
        }

        /*
         * If VFIO created a table, it was not disposed of by
         * tce_iommu_detach_group(), so do it now.
         */
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (!tbl)
                        continue;

                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
                tce_iommu_free_table(tbl);
        }

        tce_iommu_disable(container);
        mutex_destroy(&container->lock);

        kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
                unsigned long hpa)
{
        struct page *page;

        page = pfn_to_page(hpa >> PAGE_SHIFT);
        put_page(page);
}

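/*
 * Translate a userspace address into a host physical address using the
 * preregistered memory regions; fails if the address was not
 * preregistered with VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
 */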
static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
                unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem;

        mem = mm_iommu_lookup(tce, size);
        if (!mem)
                return -EINVAL;

        ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
        if (ret)
                return -EINVAL;

        *pmem = mem;

        return 0;
}

static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
                unsigned long entry)
{
        struct mm_iommu_table_group_mem_t *mem = NULL;
        int ret;
        unsigned long hpa = 0;
        unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

        if (!pua || !current || !current->mm)
                return;

        ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
                        &hpa, &mem);
        if (ret)
                pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
                                __func__, *pua, entry, ret);
        if (mem)
                mm_iommu_mapped_dec(mem);

        *pua = 0;
}

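/*
 * Clear @pages TCE entries starting at @entry, releasing the host pages
 * (or, for v2, the preregistered-region references) that were mapped
 * there.
 */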
static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages)
{
        unsigned long oldhpa;
        long ret;
        enum dma_data_direction direction;

        for ( ; pages; --pages, ++entry) {
                direction = DMA_NONE;
                oldhpa = 0;
                ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
                if (ret)
                        continue;

                if (direction == DMA_NONE)
                        continue;

                if (container->v2) {
                        tce_iommu_unuse_page_v2(tbl, entry);
                        continue;
                }

                tce_iommu_unuse_page(container, oldhpa);
        }

        return 0;
}

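/*
 * Pin the userspace page backing @tce with get_user_pages_fast() and
 * return its host physical address.
 */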
static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
        struct page *page = NULL;
        enum dma_data_direction direction = iommu_tce_direction(tce);

        if (get_user_pages_fast(tce & PAGE_MASK, 1,
                        direction != DMA_TO_DEVICE, &page) != 1)
                return -EFAULT;

        *hpa = __pa((unsigned long) page_address(page));

        return 0;
}

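/*
 * v1 map path: pin userspace pages one by one and program them into the
 * TCE table; on failure, the entries mapped so far are cleared again.
 */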
static long tce_iommu_build(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages,
                enum dma_data_direction direction)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction dirtmp;

        for (i = 0; i < pages; ++i) {
                unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

                ret = tce_iommu_use_page(tce, &hpa);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                hpa |= offset;
                dirtmp = direction;
                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        tce_iommu_unuse_page(container, hpa);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }

                if (dirtmp != DMA_NONE)
                        tce_iommu_unuse_page(container, hpa);

                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

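/*
 * v2 map path: the userspace memory must have been preregistered, so
 * the pages are already pinned; translate each address, program the TCE
 * and remember the userspace address for later unmapping.
 */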
static long tce_iommu_build_v2(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages,
                enum dma_data_direction direction)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction dirtmp;

        if (!tbl->it_userspace) {
                ret = tce_iommu_userspace_view_alloc(tbl);
                if (ret)
                        return ret;
        }

        for (i = 0; i < pages; ++i) {
                struct mm_iommu_table_group_mem_t *mem = NULL;
                unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
                                entry + i);

                ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
                                &hpa, &mem);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                /* Preserve offset within IOMMU page */
                hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
                dirtmp = direction;

                /* The registered region is being unregistered */
                if (mm_iommu_mapped_inc(mem))
                        break;

                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        /* dirtmp cannot be DMA_NONE here */
                        tce_iommu_unuse_page_v2(tbl, entry + i);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }

                if (dirtmp != DMA_NONE)
                        tce_iommu_unuse_page_v2(tbl, entry + i);

                *pua = tce;

                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

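/*
 * Allocate a new hardware TCE table via the platform callbacks,
 * accounting its size against RLIMIT_MEMLOCK first.
 */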
static long tce_iommu_create_table(struct tce_container *container,
                        struct iommu_table_group *table_group,
                        int num,
                        __u32 page_shift,
                        __u64 window_size,
                        __u32 levels,
                        struct iommu_table **ptbl)
{
        long ret, table_size;

        table_size = table_group->ops->get_table_size(page_shift, window_size,
                        levels);
        if (!table_size)
                return -EINVAL;

        ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
        if (ret)
                return ret;

        ret = table_group->ops->create_table(table_group, num,
                        page_shift, window_size, levels, ptbl);

        WARN_ON(!ret && !(*ptbl)->it_ops->free);
        WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

        return ret;
}

static void tce_iommu_free_table(struct iommu_table *tbl)
{
        unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

        tce_iommu_userspace_view_free(tbl);
        tbl->it_ops->free(tbl);
        decrement_locked_vm(pages);
}

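/*
 * Create a DMA window (backing VFIO_IOMMU_SPAPR_TCE_CREATE): allocate a
 * TCE table and program it into every attached group.
 */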
static long tce_iommu_create_window(struct tce_container *container,
                __u32 page_shift, __u64 window_size, __u32 levels,
                __u64 *start_addr)
{
        struct tce_iommu_group *tcegrp;
        struct iommu_table_group *table_group;
        struct iommu_table *tbl = NULL;
        long ret, num;

        num = tce_iommu_find_free_table(container);
        if (num < 0)
                return num;

        /* Get the first group for ops::create_table */
        tcegrp = list_first_entry(&container->group_list,
                        struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        if (!table_group)
                return -EFAULT;

        if (!(table_group->pgsizes & (1ULL << page_shift)))
                return -EINVAL;

        if (!table_group->ops->set_window || !table_group->ops->unset_window ||
                        !table_group->ops->get_table_size ||
                        !table_group->ops->create_table)
                return -EPERM;

        /* Create TCE table */
        ret = tce_iommu_create_table(container, table_group, num,
                        page_shift, window_size, levels, &tbl);
        if (ret)
                return ret;

        BUG_ON(!tbl->it_ops->free);

        /*
         * Program the table to every group.
         * Groups have been tested for compatibility at attach time.
         */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                ret = table_group->ops->set_window(table_group, num, tbl);
                if (ret)
                        goto unset_exit;
        }

        container->tables[num] = tbl;

        /* Return start address assigned by platform in create_table() */
        *start_addr = tbl->it_offset << tbl->it_page_shift;

        return 0;

unset_exit:
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);
                table_group->ops->unset_window(table_group, num);
        }
        tce_iommu_free_table(tbl);

        return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
                __u64 start_addr)
{
        struct iommu_table_group *table_group = NULL;
        struct iommu_table *tbl;
        struct tce_iommu_group *tcegrp;
        int num;

        num = tce_iommu_find_table(container, start_addr, &tbl);
        if (num < 0)
                return -EINVAL;

        BUG_ON(!tbl->it_size);

        /* Detach groups from IOMMUs */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                /*
                 * SPAPR TCE IOMMU exposes the default DMA window to
                 * the guest via dma32_window_start/size of
                 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
                 * userspace to remove this window, some do not, so here
                 * we check for the platform capability.
                 */
                if (!table_group->ops || !table_group->ops->unset_window)
                        return -EPERM;

                table_group->ops->unset_window(table_group, num);
        }

        /* Free table */
        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
        tce_iommu_free_table(tbl);
        container->tables[num] = NULL;

        return 0;
}

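/* The VFIO ioctl handler for SPAPR TCE containers. */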
static long tce_iommu_ioctl(void *iommu_data,
                                 unsigned int cmd, unsigned long arg)
{
        struct tce_container *container = iommu_data;
        unsigned long minsz, ddwsz;
        long ret;

        switch (cmd) {
        case VFIO_CHECK_EXTENSION:
                switch (arg) {
                case VFIO_SPAPR_TCE_IOMMU:
                case VFIO_SPAPR_TCE_v2_IOMMU:
                        ret = 1;
                        break;
                default:
                        ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
                        break;
                }

                return (ret < 0) ? 0 : ret;

        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                struct vfio_iommu_spapr_tce_info info;
                struct tce_iommu_group *tcegrp;
                struct iommu_table_group *table_group;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                if (!table_group)
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
                                dma32_window_size);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.dma32_window_start = table_group->tce32_start;
                info.dma32_window_size = table_group->tce32_size;
                info.flags = 0;
                memset(&info.ddw, 0, sizeof(info.ddw));

                if (table_group->max_dynamic_windows_supported &&
                                container->v2) {
                        info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
                        info.ddw.pgsizes = table_group->pgsizes;
                        info.ddw.max_dynamic_windows_supported =
                                table_group->max_dynamic_windows_supported;
                        info.ddw.levels = table_group->max_levels;
                }

                ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

                if (info.argsz >= ddwsz)
                        minsz = ddwsz;

                if (copy_to_user((void __user *)arg, &info, minsz))
                        return -EFAULT;

                return 0;
        }
        case VFIO_IOMMU_MAP_DMA: {
                struct vfio_iommu_type1_dma_map param;
                struct iommu_table *tbl = NULL;
                long num;
                enum dma_data_direction direction;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
                                VFIO_DMA_MAP_FLAG_WRITE))
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
                                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
                        return -EINVAL;

                /* iova is checked by the IOMMU API */
                if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_BIDIRECTIONAL;
                        else
                                direction = DMA_TO_DEVICE;
                } else {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_FROM_DEVICE;
                        else
                                return -EINVAL;
                }

                ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
                if (ret)
                        return ret;

                if (container->v2)
                        ret = tce_iommu_build_v2(container, tbl,
                                        param.iova >> tbl->it_page_shift,
                                        param.vaddr,
                                        param.size >> tbl->it_page_shift,
                                        direction);
                else
                        ret = tce_iommu_build(container, tbl,
                                        param.iova >> tbl->it_page_shift,
                                        param.vaddr,
                                        param.size >> tbl->it_page_shift,
                                        direction);

                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_UNMAP_DMA: {
                struct vfio_iommu_type1_dma_unmap param;
                struct iommu_table *tbl = NULL;
                long num;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if (param.size & ~IOMMU_PAGE_MASK(tbl))
                        return -EINVAL;

                ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
                                param.size >> tbl->it_page_shift);
                if (ret)
                        return ret;

                ret = tce_iommu_clear(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                param.size >> tbl->it_page_shift);
                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
                struct vfio_iommu_spapr_register_memory param;

                if (!container->v2)
                        break;

                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);
                ret = tce_iommu_register_pages(container, param.vaddr,
                                param.size);
                mutex_unlock(&container->lock);

                return ret;
        }
        case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
                struct vfio_iommu_spapr_register_memory param;

                if (!container->v2)
                        break;

                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);
                ret = tce_iommu_unregister_pages(container, param.vaddr,
                                param.size);
                mutex_unlock(&container->lock);

                return ret;
        }
        case VFIO_IOMMU_ENABLE:
                if (container->v2)
                        break;

                mutex_lock(&container->lock);
                ret = tce_iommu_enable(container);
                mutex_unlock(&container->lock);
                return ret;

        case VFIO_IOMMU_DISABLE:
                if (container->v2)
                        break;

                mutex_lock(&container->lock);
                tce_iommu_disable(container);
                mutex_unlock(&container->lock);
                return 0;

        case VFIO_EEH_PE_OP: {
                struct tce_iommu_group *tcegrp;

                ret = 0;
                list_for_each_entry(tcegrp, &container->group_list, next) {
                        ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
                                        cmd, arg);
                        if (ret)
                                return ret;
                }
                return ret;
        }

        case VFIO_IOMMU_SPAPR_TCE_CREATE: {
                struct vfio_iommu_spapr_tce_create create;

                if (!container->v2)
                        break;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
                                start_addr);

                if (copy_from_user(&create, (void __user *)arg, minsz))
                        return -EFAULT;

                if (create.argsz < minsz)
                        return -EINVAL;

                if (create.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);

                ret = tce_iommu_create_window(container, create.page_shift,
                                create.window_size, create.levels,
                                &create.start_addr);

                mutex_unlock(&container->lock);

                if (!ret && copy_to_user((void __user *)arg, &create, minsz))
                        ret = -EFAULT;

                return ret;
        }
        case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
                struct vfio_iommu_spapr_tce_remove remove;

                if (!container->v2)
                        break;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
                                start_addr);

                if (copy_from_user(&remove, (void __user *)arg, minsz))
                        return -EFAULT;

                if (remove.argsz < minsz)
                        return -EINVAL;

                if (remove.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);

                ret = tce_iommu_remove_window(container, remove.start_addr);

                mutex_unlock(&container->lock);

                return ret;
        }
        }

        return -ENOTTY;
}

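/*
 * Ownership helpers for groups without dynamic DMA window (DDW)
 * support: the platform's tables are taken over (and given back) via
 * iommu_take_ownership()/iommu_release_ownership().
 */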
static void tce_iommu_release_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (!tbl)
                        continue;

                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
                tce_iommu_userspace_view_free(tbl);
                if (tbl->it_map)
                        iommu_release_ownership(tbl);

                container->tables[i] = NULL;
        }
}

static int tce_iommu_take_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i, j, rc = 0;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (!tbl || !tbl->it_map)
                        continue;

                rc = iommu_take_ownership(tbl);
                if (rc) {
                        for (j = 0; j < i; ++j)
                                iommu_release_ownership(
                                                table_group->tables[j]);

                        return rc;
                }
        }

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                container->tables[i] = table_group->tables[i];

        return 0;
}

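/*
 * Ownership helpers for DDW-capable groups: the platform hands over
 * control of the table group and windows are programmed through the
 * set_window()/unset_window() callbacks instead.
 */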
static void tce_iommu_release_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        long i;

        if (!table_group->ops->unset_window) {
                WARN_ON_ONCE(1);
                return;
        }

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                table_group->ops->unset_window(table_group, i);

        table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        long i, ret = 0;
        struct iommu_table *tbl = NULL;

        if (!table_group->ops->create_table || !table_group->ops->set_window ||
                        !table_group->ops->release_ownership) {
                WARN_ON_ONCE(1);
                return -EFAULT;
        }

        table_group->ops->take_ownership(table_group);

        /*
         * If this is the first group attached, check if there is
         * a default DMA window and create one if there is none, as
         * userspace expects it to exist.
         */
        if (!tce_groups_attached(container) && !container->tables[0]) {
                ret = tce_iommu_create_table(container,
                                table_group,
                                0, /* window number */
                                IOMMU_PAGE_SHIFT_4K,
                                table_group->tce32_size,
                                1, /* default levels */
                                &tbl);
                if (ret)
                        goto release_exit;
                else
                        container->tables[0] = tbl;
        }

        /* Set all windows to the new group */
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                tbl = container->tables[i];

                if (!tbl)
                        continue;

                /* Set the default window to a new group */
                ret = table_group->ops->set_window(table_group, i, tbl);
                if (ret)
                        goto release_exit;
        }

        return 0;

release_exit:
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                table_group->ops->unset_window(table_group, i);

        table_group->ops->release_ownership(table_group);

        return ret;
}

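/*
 * Attach an IOMMU group to the container; additional groups are only
 * accepted if they share the same table_group ops as the groups already
 * attached.
 */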
static int tce_iommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        int ret;
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp = NULL;

        mutex_lock(&container->lock);

        /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
                        iommu_group_id(iommu_group), iommu_group); */
        table_group = iommu_group_get_iommudata(iommu_group);
        if (!table_group) {
                ret = -ENODEV;
                goto unlock_exit;
        }

        if (tce_groups_attached(container) && (!table_group->ops ||
                        !table_group->ops->take_ownership ||
                        !table_group->ops->release_ownership)) {
                ret = -EBUSY;
                goto unlock_exit;
        }

        /* Check if new group has the same iommu_ops (i.e. compatible) */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                struct iommu_table_group *table_group_tmp;

                if (tcegrp->grp == iommu_group) {
                        pr_warn("tce_vfio: Group %d is already attached\n",
                                        iommu_group_id(iommu_group));
                        ret = -EBUSY;
                        goto unlock_exit;
                }
                table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
                if (table_group_tmp->ops != table_group->ops) {
                        pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
                                        iommu_group_id(iommu_group),
                                        iommu_group_id(tcegrp->grp));
                        ret = -EPERM;
                        goto unlock_exit;
                }
        }

        tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
        if (!tcegrp) {
                ret = -ENOMEM;
                goto unlock_exit;
        }

        if (!table_group->ops || !table_group->ops->take_ownership ||
                        !table_group->ops->release_ownership)
                ret = tce_iommu_take_ownership(container, table_group);
        else
                ret = tce_iommu_take_ownership_ddw(container, table_group);

        if (!ret) {
                tcegrp->grp = iommu_group;
                list_add(&tcegrp->next, &container->group_list);
        }

unlock_exit:
        if (ret && tcegrp)
                kfree(tcegrp);

        mutex_unlock(&container->lock);

        return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        bool found = false;
        struct tce_iommu_group *tcegrp;

        mutex_lock(&container->lock);

        list_for_each_entry(tcegrp, &container->group_list, next) {
                if (tcegrp->grp == iommu_group) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                pr_warn("tce_vfio: detaching unattached group #%u\n",
                                iommu_group_id(iommu_group));
                goto unlock_exit;
        }

        list_del(&tcegrp->next);
        kfree(tcegrp);

        table_group = iommu_group_get_iommudata(iommu_group);
        BUG_ON(!table_group);

        if (!table_group->ops || !table_group->ops->release_ownership)
                tce_iommu_release_ownership(container, table_group);
        else
                tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
        mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
        .name           = "iommu-vfio-powerpc",
        .owner          = THIS_MODULE,
        .open           = tce_iommu_open,
        .release        = tce_iommu_release,
        .ioctl          = tce_iommu_ioctl,
        .attach_group   = tce_iommu_attach_group,
        .detach_group   = tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
        return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
        vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);