GNU Linux-libre 4.19.286-gnu1
[releases.git] / drivers / misc / vmw_balloon.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * VMware Balloon driver.
4  *
5  * Copyright (C) 2000-2018, VMware, Inc. All Rights Reserved.
6  *
7  * This is VMware physical memory management driver for Linux. The driver
8  * acts like a "balloon" that can be inflated to reclaim physical pages by
9  * reserving them in the guest and invalidating them in the monitor,
10  * freeing up the underlying machine pages so they can be allocated to
11  * other guests.  The balloon can also be deflated to allow the guest to
12  * use more physical memory. Higher level policies can control the sizes
13  * of balloons in VMs in order to manage physical memory resources.
14  */
15
16 //#define DEBUG
17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/mm.h>
22 #include <linux/vmalloc.h>
23 #include <linux/sched.h>
24 #include <linux/module.h>
25 #include <linux/workqueue.h>
26 #include <linux/debugfs.h>
27 #include <linux/seq_file.h>
28 #include <linux/vmw_vmci_defs.h>
29 #include <linux/vmw_vmci_api.h>
30 #include <asm/hypervisor.h>
31
32 MODULE_AUTHOR("VMware, Inc.");
33 MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
34 MODULE_VERSION("1.5.0.0-k");
35 MODULE_ALIAS("dmi:*:svnVMware*:*");
36 MODULE_ALIAS("vmware_vmmemctl");
37 MODULE_LICENSE("GPL");
38
39 /*
40  * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
41  * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
42  * __GFP_NOWARN, to suppress page allocation failure warnings.
43  */
44 #define VMW_PAGE_ALLOC_NOSLEEP          (__GFP_HIGHMEM|__GFP_NOWARN)
45
46 /*
47  * Use GFP_HIGHUSER when executing in a separate kernel thread
48  * context and allocation can sleep.  This is less stressful to
49  * the guest memory system, since it allows the thread to block
50  * while memory is reclaimed, and won't take pages from emergency
51  * low-memory pools.
52  */
53 #define VMW_PAGE_ALLOC_CANSLEEP         (GFP_HIGHUSER)
54
55 /* Maximum number of refused pages we accumulate during inflation cycle */
56 #define VMW_BALLOON_MAX_REFUSED         16
57
58 /*
59  * Hypervisor communication port definitions.
60  */
61 #define VMW_BALLOON_HV_PORT             0x5670
62 #define VMW_BALLOON_HV_MAGIC            0x456c6d6f
63 #define VMW_BALLOON_GUEST_ID            1       /* Linux */
64
/*
 * Optional protocol features negotiated with the host during the START
 * handshake (see VMW_BALLOON_CAPABILITIES and vmballoon_send_start()).
 */
enum vmwballoon_capabilities {
	/*
	 * Bit 0 is reserved and not associated to any capability.
	 */
	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
};
74
75 #define VMW_BALLOON_CAPABILITIES        (VMW_BALLOON_BASIC_CMDS \
76                                         | VMW_BALLOON_BATCHED_CMDS \
77                                         | VMW_BALLOON_BATCHED_2M_CMDS \
78                                         | VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
79
80 #define VMW_BALLOON_2M_SHIFT            (9)
81 #define VMW_BALLOON_NUM_PAGE_SIZES      (2)
82
83 /*
84  * Backdoor commands availability:
85  *
86  * START, GET_TARGET and GUEST_ID are always available,
87  *
88  * VMW_BALLOON_BASIC_CMDS:
89  *      LOCK and UNLOCK commands,
90  * VMW_BALLOON_BATCHED_CMDS:
91  *      BATCHED_LOCK and BATCHED_UNLOCK commands.
 * VMW_BALLOON_BATCHED_2M_CMDS:
 *      BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 *      VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
96  */
97 #define VMW_BALLOON_CMD_START                   0
98 #define VMW_BALLOON_CMD_GET_TARGET              1
99 #define VMW_BALLOON_CMD_LOCK                    2
100 #define VMW_BALLOON_CMD_UNLOCK                  3
101 #define VMW_BALLOON_CMD_GUEST_ID                4
102 #define VMW_BALLOON_CMD_BATCHED_LOCK            6
103 #define VMW_BALLOON_CMD_BATCHED_UNLOCK          7
104 #define VMW_BALLOON_CMD_BATCHED_2M_LOCK         8
105 #define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK       9
106 #define VMW_BALLOON_CMD_VMCI_DOORBELL_SET       10
107
108
109 /* error codes */
110 #define VMW_BALLOON_SUCCESS                     0
111 #define VMW_BALLOON_FAILURE                     -1
112 #define VMW_BALLOON_ERROR_CMD_INVALID           1
113 #define VMW_BALLOON_ERROR_PPN_INVALID           2
114 #define VMW_BALLOON_ERROR_PPN_LOCKED            3
115 #define VMW_BALLOON_ERROR_PPN_UNLOCKED          4
116 #define VMW_BALLOON_ERROR_PPN_PINNED            5
117 #define VMW_BALLOON_ERROR_PPN_NOTNEEDED         6
118 #define VMW_BALLOON_ERROR_RESET                 7
119 #define VMW_BALLOON_ERROR_BUSY                  8
120
121 #define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES   (0x03000000)
122
123 /* Batch page description */
124
125 /*
126  * Layout of a page in the batch page:
127  *
128  * +-------------+----------+--------+
129  * |             |          |        |
130  * | Page number | Reserved | Status |
131  * |             |          |        |
132  * +-------------+----------+--------+
133  * 64  PAGE_SHIFT          6         0
134  *
135  * The reserved field should be set to 0.
136  */
137 #define VMW_BALLOON_BATCH_MAX_PAGES     (PAGE_SIZE / sizeof(u64))
138 #define VMW_BALLOON_BATCH_STATUS_MASK   ((1UL << 5) - 1)
139 #define VMW_BALLOON_BATCH_PAGE_MASK     (~((1UL << PAGE_SHIFT) - 1))
140
/* One guest page holding an array of 64-bit batch entries (see layout above). */
struct vmballoon_batch_page {
	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
};
144
145 static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
146 {
147         return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
148 }
149
150 static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
151                                 int idx)
152 {
153         return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
154 }
155
/*
 * Store physical address @pa in batch entry @idx.
 * NOTE(review): the low bits of the entry carry the status field on
 * return from the host; callers pass page-aligned addresses so those
 * bits start out zero.
 */
static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
				u64 pa)
{
	batch->pages[idx] = pa;
}
161
162
/*
 * Issue a balloon backdoor command to the hypervisor via the "inl" port
 * protocol: magic number in %eax, command in %ecx, port in %edx, and the
 * two arguments in %ebx/%esi.  The hypervisor returns the status in %eax
 * and an auxiliary value in %ebx (stored into @result).  For the START
 * command the capability set comes back in %ecx instead, hence the
 * fix-up below.  The statement expression evaluates to the status code.
 */
#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)             \
({                                                              \
	unsigned long __status, __dummy1, __dummy2, __dummy3;   \
	__asm__ __volatile__ ("inl %%dx" :                      \
		"=a"(__status),                                 \
		"=c"(__dummy1),                                 \
		"=d"(__dummy2),                                 \
		"=b"(result),                                   \
		"=S" (__dummy3) :                               \
		"0"(VMW_BALLOON_HV_MAGIC),                      \
		"1"(VMW_BALLOON_CMD_##cmd),                     \
		"2"(VMW_BALLOON_HV_PORT),                       \
		"3"(arg1),                                      \
		"4" (arg2) :                                    \
		"memory");                                      \
	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)     \
		result = __dummy1;                              \
	result &= -1UL;                                         \
	__status & -1UL;                                        \
})
183
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters exported through debugfs.  Arrays are indexed by page
 * size (0 == 4k, 1 == 2m pages), matching VMW_BALLOON_NUM_PAGE_SIZES.
 */
struct vmballoon_stats {
	unsigned int timer;
	unsigned int doorbell;

	/* allocation statistics */
	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int sleep_alloc;
	unsigned int sleep_alloc_fail;
	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

	/* monitor operations */
	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int target;
	unsigned int target_fail;
	unsigned int start;
	unsigned int start_fail;
	unsigned int guest_type;
	unsigned int guest_type_fail;
	unsigned int doorbell_set;
	unsigned int doorbell_unset;
};

/* STATS_INC compiles away entirely when debugfs support is disabled. */
#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif
217
struct vmballoon;

/*
 * Protocol operations vector.  The legacy (one page at a time) and the
 * batched protocols provide different implementations of page
 * submission and of the lock/unlock hypervisor calls.
 */
struct vmballoon_ops {
	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
	int (*lock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
};
227
/*
 * Per-page-size state: one instance for 4k pages and one for 2m pages
 * (see vmballoon.page_sizes).
 */
struct vmballoon_page_size {
	/* list of reserved physical pages */
	struct list_head pages;

	/* transient list of non-balloonable pages */
	struct list_head refused_pages;
	unsigned int n_refused_pages;
};
236
/*
 * Driver state.  Tracks the current and target balloon size, the
 * per-page-size page lists, and the communication mode (legacy vs.
 * batched) negotiated with the host.
 */
struct vmballoon {
	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
	unsigned supported_page_sizes;

	/* balloon size in pages */
	unsigned int size;
	unsigned int target;

	/* reset flag */
	bool reset_required;

	/* capabilities negotiated in vmballoon_send_start() */
	unsigned long capabilities;

	/* batch page and its capacity, used with the batched protocol */
	struct vmballoon_batch_page *batch_page;
	unsigned int batch_max_pages;
	/* page being submitted with the legacy (non-batched) protocol */
	struct page *page;

	const struct vmballoon_ops *ops;

#ifdef CONFIG_DEBUG_FS
	/* statistics */
	struct vmballoon_stats stats;

	/* debugfs file exporting statistics */
	struct dentry *dbg_entry;
#endif

	struct sysinfo sysinfo;

	struct delayed_work dwork;

	struct vmci_handle vmci_doorbell;
};
272
273 static struct vmballoon balloon;
274
275 /*
276  * Send "start" command to the host, communicating supported version
277  * of the protocol.
278  */
279 static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
280 {
281         unsigned long status, capabilities, dummy = 0;
282         bool success;
283
284         STATS_INC(b->stats.start);
285
286         status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
287
288         switch (status) {
289         case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
290                 b->capabilities = capabilities;
291                 success = true;
292                 break;
293         case VMW_BALLOON_SUCCESS:
294                 b->capabilities = VMW_BALLOON_BASIC_CMDS;
295                 success = true;
296                 break;
297         default:
298                 success = false;
299         }
300
301         /*
302          * 2MB pages are only supported with batching. If batching is for some
303          * reason disabled, do not use 2MB pages, since otherwise the legacy
304          * mechanism is used with 2MB pages, causing a failure.
305          */
306         if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
307             (b->capabilities & VMW_BALLOON_BATCHED_CMDS))
308                 b->supported_page_sizes = 2;
309         else
310                 b->supported_page_sizes = 1;
311
312         if (!success) {
313                 pr_debug("%s - failed, hv returns %ld\n", __func__, status);
314                 STATS_INC(b->stats.start_fail);
315         }
316         return success;
317 }
318
319 static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
320 {
321         switch (status) {
322         case VMW_BALLOON_SUCCESS:
323                 return true;
324
325         case VMW_BALLOON_ERROR_RESET:
326                 b->reset_required = true;
327                 /* fall through */
328
329         default:
330                 return false;
331         }
332 }
333
334 /*
335  * Communicate guest type to the host so that it can adjust ballooning
336  * algorithm to the one most appropriate for the guest. This command
337  * is normally issued after sending "start" command and is part of
338  * standard reset sequence.
339  */
340 static bool vmballoon_send_guest_id(struct vmballoon *b)
341 {
342         unsigned long status, dummy = 0;
343
344         status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
345                                 dummy);
346
347         STATS_INC(b->stats.guest_type);
348
349         if (vmballoon_check_status(b, status))
350                 return true;
351
352         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
353         STATS_INC(b->stats.guest_type_fail);
354         return false;
355 }
356
357 static u16 vmballoon_page_size(bool is_2m_page)
358 {
359         if (is_2m_page)
360                 return 1 << VMW_BALLOON_2M_SHIFT;
361
362         return 1;
363 }
364
365 /*
366  * Retrieve desired balloon size from the host.
367  */
368 static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
369 {
370         unsigned long status;
371         unsigned long target;
372         unsigned long limit;
373         unsigned long dummy = 0;
374         u32 limit32;
375
376         /*
377          * si_meminfo() is cheap. Moreover, we want to provide dynamic
378          * max balloon size later. So let us call si_meminfo() every
379          * iteration.
380          */
381         si_meminfo(&b->sysinfo);
382         limit = b->sysinfo.totalram;
383
384         /* Ensure limit fits in 32-bits */
385         limit32 = (u32)limit;
386         if (limit != limit32)
387                 return false;
388
389         /* update stats */
390         STATS_INC(b->stats.target);
391
392         status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
393         if (vmballoon_check_status(b, status)) {
394                 *new_target = target;
395                 return true;
396         }
397
398         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
399         STATS_INC(b->stats.target_fail);
400         return false;
401 }
402
403 /*
404  * Notify the host about allocated page so that host can use it without
405  * fear that guest will need it. Host may reject some pages, we need to
406  * check the return value and maybe submit a different page.
407  */
408 static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
409                                 unsigned int *hv_status, unsigned int *target)
410 {
411         unsigned long status, dummy = 0;
412         u32 pfn32;
413
414         pfn32 = (u32)pfn;
415         if (pfn32 != pfn)
416                 return -EINVAL;
417
418         STATS_INC(b->stats.lock[false]);
419
420         *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
421         if (vmballoon_check_status(b, status))
422                 return 0;
423
424         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
425         STATS_INC(b->stats.lock_fail[false]);
426         return -EIO;
427 }
428
429 static int vmballoon_send_batched_lock(struct vmballoon *b,
430                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
431 {
432         unsigned long status;
433         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
434
435         STATS_INC(b->stats.lock[is_2m_pages]);
436
437         if (is_2m_pages)
438                 status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
439                                 *target);
440         else
441                 status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
442                                 *target);
443
444         if (vmballoon_check_status(b, status))
445                 return 0;
446
447         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
448         STATS_INC(b->stats.lock_fail[is_2m_pages]);
449         return 1;
450 }
451
452 /*
453  * Notify the host that guest intends to release given page back into
454  * the pool of available (to the guest) pages.
455  */
456 static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
457                                                         unsigned int *target)
458 {
459         unsigned long status, dummy = 0;
460         u32 pfn32;
461
462         pfn32 = (u32)pfn;
463         if (pfn32 != pfn)
464                 return false;
465
466         STATS_INC(b->stats.unlock[false]);
467
468         status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
469         if (vmballoon_check_status(b, status))
470                 return true;
471
472         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
473         STATS_INC(b->stats.unlock_fail[false]);
474         return false;
475 }
476
477 static bool vmballoon_send_batched_unlock(struct vmballoon *b,
478                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
479 {
480         unsigned long status;
481         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
482
483         STATS_INC(b->stats.unlock[is_2m_pages]);
484
485         if (is_2m_pages)
486                 status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
487                                 *target);
488         else
489                 status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
490                                 *target);
491
492         if (vmballoon_check_status(b, status))
493                 return true;
494
495         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
496         STATS_INC(b->stats.unlock_fail[is_2m_pages]);
497         return false;
498 }
499
500 static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
501 {
502         if (is_2m_page)
503                 return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
504
505         return alloc_page(flags);
506 }
507
508 static void vmballoon_free_page(struct page *page, bool is_2m_page)
509 {
510         if (is_2m_page)
511                 __free_pages(page, VMW_BALLOON_2M_SHIFT);
512         else
513                 __free_page(page);
514 }
515
/*
 * Quickly release all pages allocated for the balloon. This function is
 * called when host decides to "reset" balloon for one reason or another.
 * Unlike normal "deflate" we do not (shall not) notify host of the pages
 * being released.
 */
static void vmballoon_pop(struct vmballoon *b)
{
	struct page *page, *next;
	unsigned is_2m_pages;

	/* Walk both page-size lists (4k and 2m) and free everything. */
	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];
		u16 size_per_page = vmballoon_page_size(is_2m_pages);

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			list_del(&page->lru);
			vmballoon_free_page(page, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);
			b->size -= size_per_page;
			/* yield the CPU periodically; the lists can be long */
			cond_resched();
		}
	}

	/* Clearing the batch_page unconditionally has no adverse effect */
	free_page((unsigned long)b->batch_page);
	b->batch_page = NULL;
}
546
/*
 * Notify the host of a ballooned page. If host rejects the page put it on the
 * refuse list, those refused page are then released at the end of the
 * inflation cycle.
 *
 * Returns 0 on success, -EIO when the page was freed due to a fatal
 * hypervisor error, or the lock error code when the page was refused.
 */
static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
				bool is_2m_pages, unsigned int *target)
{
	int locked, hv_status;
	struct page *page = b->page;
	struct vmballoon_page_size *page_size = &b->page_sizes[false];

	/* is_2m_pages can never happen as 2m pages support implies batching */

	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
								target);
	if (locked) {
		STATS_INC(b->stats.refused_alloc[false]);

		/* On reset or "page not needed", give the page back now. */
		if (locked == -EIO &&
		    (hv_status == VMW_BALLOON_ERROR_RESET ||
		     hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) {
			vmballoon_free_page(page, false);
			return -EIO;
		}

		/*
		 * Place page on the list of non-balloonable pages
		 * and retry allocation, unless we already accumulated
		 * too many of them, in which case take a breather.
		 */
		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
			page_size->n_refused_pages++;
			list_add(&page->lru, &page_size->refused_pages);
		} else {
			vmballoon_free_page(page, false);
		}
		return locked;
	}

	/* track allocated page */
	list_add(&page->lru, &page_size->pages);

	/* update balloon size */
	b->size++;

	return 0;
}
595
/*
 * Lock a batch of pages.  If the whole batch fails every page in it is
 * freed; otherwise each page's individual status decides whether it is
 * kept in the balloon, parked on the refused list, or freed.
 */
static int vmballoon_lock_batched_page(struct vmballoon *b,
		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
{
	int locked, i;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
			target);
	if (locked > 0) {
		/* Whole-batch failure: free every page in the batch. */
		for (i = 0; i < num_pages; i++) {
			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);

			vmballoon_free_page(p, is_2m_pages);
		}

		return -EIO;
	}

	/* Process the per-page status codes returned by the hypervisor. */
	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);

		switch (locked) {
		case VMW_BALLOON_SUCCESS:
			/* page is ballooned: track it and grow the balloon */
			list_add(&p->lru, &page_size->pages);
			b->size += size_per_page;
			break;
		case VMW_BALLOON_ERROR_PPN_PINNED:
		case VMW_BALLOON_ERROR_PPN_INVALID:
			/* transient refusal: defer, unless too many piled up */
			if (page_size->n_refused_pages
					< VMW_BALLOON_MAX_REFUSED) {
				list_add(&p->lru, &page_size->refused_pages);
				page_size->n_refused_pages++;
				break;
			}
			/* Fallthrough */
		case VMW_BALLOON_ERROR_RESET:
		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
			vmballoon_free_page(p, is_2m_pages);
			break;
		default:
			/* This should never happen */
			WARN_ON_ONCE(true);
		}
	}

	return 0;
}
649
650 /*
651  * Release the page allocated for the balloon. Note that we first notify
652  * the host so it can make sure the page will be available for the guest
653  * to use, if needed.
654  */
655 static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
656                 bool is_2m_pages, unsigned int *target)
657 {
658         struct page *page = b->page;
659         struct vmballoon_page_size *page_size = &b->page_sizes[false];
660
661         /* is_2m_pages can never happen as 2m pages support implies batching */
662
663         if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
664                 list_add(&page->lru, &page_size->pages);
665                 return -EIO;
666         }
667
668         /* deallocate page */
669         vmballoon_free_page(page, false);
670         STATS_INC(b->stats.free[false]);
671
672         /* update balloon size */
673         b->size--;
674
675         return 0;
676 }
677
/*
 * Unlock a batch of pages: ask the host to release them, then free the
 * pages the host confirmed and re-own the ones it did not.  Returns
 * -EIO when the batch command itself failed, 0 otherwise.
 */
static int vmballoon_unlock_batched_page(struct vmballoon *b,
				unsigned int num_pages, bool is_2m_pages,
				unsigned int *target)
{
	int locked, i, ret = 0;
	bool hv_success;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
			target);
	if (!hv_success)
		ret = -EIO;

	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);
		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
			/*
			 * That page wasn't successfully unlocked by the
			 * hypervisor, re-add it to the list of pages owned by
			 * the balloon driver.
			 */
			list_add(&p->lru, &page_size->pages);
		} else {
			/* deallocate page */
			vmballoon_free_page(p, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);

			/* update balloon size */
			b->size -= size_per_page;
		}
	}

	return ret;
}
717
718 /*
719  * Release pages that were allocated while attempting to inflate the
720  * balloon but were refused by the host for one reason or another.
721  */
722 static void vmballoon_release_refused_pages(struct vmballoon *b,
723                 bool is_2m_pages)
724 {
725         struct page *page, *next;
726         struct vmballoon_page_size *page_size =
727                         &b->page_sizes[is_2m_pages];
728
729         list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
730                 list_del(&page->lru);
731                 vmballoon_free_page(page, is_2m_pages);
732                 STATS_INC(b->stats.refused_free[is_2m_pages]);
733         }
734
735         page_size->n_refused_pages = 0;
736 }
737
/* Legacy protocol: remember the single page about to be submitted. */
static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
{
	b->page = p;
}
742
/* Batched protocol: record the page's physical address in batch slot @idx. */
static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
				struct page *p)
{
	vmballoon_batch_set_pa(b->batch_page, idx,
			(u64)page_to_pfn(p) << PAGE_SHIFT);
}
749
/*
 * Inflate the balloon towards its target size. Note that we try to limit
 * the rate of allocation to make sure we are not choking the rest of the
 * system.
 */
static void vmballoon_inflate(struct vmballoon *b)
{
	unsigned int num_pages = 0;	/* pages accumulated but not yet locked */
	int error = 0;
	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
	bool is_2m_pages;

	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);

	/*
	 * First try NOSLEEP page allocations to inflate balloon.
	 *
	 * If we do not throttle nosleep allocations, we can drain all
	 * free pages in the guest quickly (if the balloon target is high).
	 * As a side-effect, draining free pages helps to inform (force)
	 * the guest to start swapping if balloon target is not met yet,
	 * which is a desired behavior. However, balloon driver can consume
	 * all available CPU cycles if too many pages are allocated in a
	 * second. Therefore, we throttle nosleep allocations even when
	 * the guest is not under memory pressure. OTOH, if we have already
	 * predicted that the guest is under memory pressure, then we
	 * slowdown page allocations considerably.
	 */

	/*
	 * Start with no sleep allocation rate which may be higher
	 * than sleeping allocation rate.
	 */
	is_2m_pages = b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;

	pr_debug("%s - goal: %d",  __func__, b->target - b->size);

	while (!b->reset_required &&
		b->size + num_pages * vmballoon_page_size(is_2m_pages)
		< b->target) {
		struct page *page;

		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
			STATS_INC(b->stats.alloc[is_2m_pages]);
		else
			STATS_INC(b->stats.sleep_alloc);

		page = vmballoon_alloc_page(flags, is_2m_pages);
		if (!page) {
			STATS_INC(b->stats.alloc_fail[is_2m_pages]);

			/* 2m allocation failed: flush the batch, fall back to 4k */
			if (is_2m_pages) {
				b->ops->lock(b, num_pages, true, &b->target);

				/*
				 * ignore errors from locking as we now switch
				 * to 4k pages and we might get different
				 * errors.
				 */

				num_pages = 0;
				is_2m_pages = false;
				continue;
			}

			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
				/*
				 * CANSLEEP page allocation failed, so guest
				 * is under severe memory pressure. We just log
				 * the event, but do not stop the inflation
				 * due to its negative impact on performance.
				 */
				STATS_INC(b->stats.sleep_alloc_fail);
				break;
			}

			/*
			 * NOSLEEP page allocation failed, so the guest is
			 * under memory pressure. Slowing down page allocations
			 * seems to be reasonable, but doing so might actually
			 * cause the hypervisor to throttle us down, resulting
			 * in degraded performance. We will count on the
			 * scheduler and standard memory management mechanisms
			 * for now.
			 */
			flags = VMW_PAGE_ALLOC_CANSLEEP;
			continue;
		}

		b->ops->add_page(b, num_pages++, page);
		if (num_pages == b->batch_max_pages) {
			error = b->ops->lock(b, num_pages, is_2m_pages,
					&b->target);
			num_pages = 0;
			if (error)
				break;
		}

		cond_resched();
	}

	/* flush any partially-filled batch left over from the loop */
	if (num_pages > 0)
		b->ops->lock(b, num_pages, is_2m_pages, &b->target);

	/* free pages the host refused during this cycle */
	vmballoon_release_refused_pages(b, true);
	vmballoon_release_refused_pages(b, false);
}
857
858 /*
859  * Decrease the size of the balloon allowing guest to use more memory.
860  */
861 static void vmballoon_deflate(struct vmballoon *b)
862 {
863         unsigned is_2m_pages;
864
865         pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
866
867         /* free pages to reach target */
868         for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
869                         is_2m_pages++) {
870                 struct page *page, *next;
871                 unsigned int num_pages = 0;
872                 struct vmballoon_page_size *page_size =
873                                 &b->page_sizes[is_2m_pages];
874
875                 list_for_each_entry_safe(page, next, &page_size->pages, lru) {
876                         if (b->reset_required ||
877                                 (b->target > 0 &&
878                                         b->size - num_pages
879                                         * vmballoon_page_size(is_2m_pages)
880                                 < b->target + vmballoon_page_size(true)))
881                                 break;
882
883                         list_del(&page->lru);
884                         b->ops->add_page(b, num_pages++, page);
885
886                         if (num_pages == b->batch_max_pages) {
887                                 int error;
888
889                                 error = b->ops->unlock(b, num_pages,
890                                                 is_2m_pages, &b->target);
891                                 num_pages = 0;
892                                 if (error)
893                                         return;
894                         }
895
896                         cond_resched();
897                 }
898
899                 if (num_pages > 0)
900                         b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
901         }
902 }
903
/* Non-batched protocol: one page is passed to the host per command. */
static const struct vmballoon_ops vmballoon_basic_ops = {
	.add_page = vmballoon_add_page,
	.lock = vmballoon_lock_page,
	.unlock = vmballoon_unlock_page
};
909
/* Batched protocol: up to batch_max_pages are staged per lock/unlock. */
static const struct vmballoon_ops vmballoon_batched_ops = {
	.add_page = vmballoon_add_batched_page,
	.lock = vmballoon_lock_batched_page,
	.unlock = vmballoon_unlock_batched_page
};
915
916 static bool vmballoon_init_batching(struct vmballoon *b)
917 {
918         struct page *page;
919
920         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
921         if (!page)
922                 return false;
923
924         b->batch_page = page_address(page);
925         return true;
926 }
927
928 /*
929  * Receive notification and resize balloon
930  */
931 static void vmballoon_doorbell(void *client_data)
932 {
933         struct vmballoon *b = client_data;
934
935         STATS_INC(b->stats.doorbell);
936
937         mod_delayed_work(system_freezable_wq, &b->dwork, 0);
938 }
939
940 /*
941  * Clean up vmci doorbell
942  */
943 static void vmballoon_vmci_cleanup(struct vmballoon *b)
944 {
945         int error;
946
947         VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
948                         VMCI_INVALID_ID, error);
949         STATS_INC(b->stats.doorbell_unset);
950
951         if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
952                 vmci_doorbell_destroy(b->vmci_doorbell);
953                 b->vmci_doorbell = VMCI_INVALID_HANDLE;
954         }
955 }
956
957 /*
958  * Initialize vmci doorbell, to get notified as soon as balloon changes
959  */
960 static int vmballoon_vmci_init(struct vmballoon *b)
961 {
962         unsigned long error, dummy;
963
964         if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
965                 return 0;
966
967         error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
968                                      VMCI_PRIVILEGE_FLAG_RESTRICTED,
969                                      vmballoon_doorbell, b);
970
971         if (error != VMCI_SUCCESS)
972                 goto fail;
973
974         error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context,
975                                    b->vmci_doorbell.resource, dummy);
976
977         STATS_INC(b->stats.doorbell_set);
978
979         if (error != VMW_BALLOON_SUCCESS)
980                 goto fail;
981
982         return 0;
983 fail:
984         vmballoon_vmci_cleanup(b);
985         return -EIO;
986 }
987
988 /*
989  * Perform standard reset sequence by popping the balloon (in case it
990  * is not  empty) and then restarting protocol. This operation normally
991  * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
992  */
993 static void vmballoon_reset(struct vmballoon *b)
994 {
995         int error;
996
997         vmballoon_vmci_cleanup(b);
998
999         /* free all pages, skipping monitor unlock */
1000         vmballoon_pop(b);
1001
1002         if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
1003                 return;
1004
1005         if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
1006                 b->ops = &vmballoon_batched_ops;
1007                 b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
1008                 if (!vmballoon_init_batching(b)) {
1009                         /*
1010                          * We failed to initialize batching, inform the monitor
1011                          * about it by sending a null capability.
1012                          *
1013                          * The guest will retry in one second.
1014                          */
1015                         vmballoon_send_start(b, 0);
1016                         return;
1017                 }
1018         } else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
1019                 b->ops = &vmballoon_basic_ops;
1020                 b->batch_max_pages = 1;
1021         }
1022
1023         b->reset_required = false;
1024
1025         error = vmballoon_vmci_init(b);
1026         if (error)
1027                 pr_err("failed to initialize vmci doorbell\n");
1028
1029         if (!vmballoon_send_guest_id(b))
1030                 pr_err("failed to send guest ID to the host\n");
1031 }
1032
1033 /*
1034  * Balloon work function: reset protocol, if needed, get the new size and
1035  * adjust balloon as needed. Repeat in 1 sec.
1036  */
1037 static void vmballoon_work(struct work_struct *work)
1038 {
1039         struct delayed_work *dwork = to_delayed_work(work);
1040         struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
1041         unsigned int target;
1042
1043         STATS_INC(b->stats.timer);
1044
1045         if (b->reset_required)
1046                 vmballoon_reset(b);
1047
1048         if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
1049                 /* update target, adjust size */
1050                 b->target = target;
1051
1052                 if (b->size < target)
1053                         vmballoon_inflate(b);
1054                 else if (target == 0 ||
1055                                 b->size > target + vmballoon_page_size(true))
1056                         vmballoon_deflate(b);
1057         }
1058
1059         /*
1060          * We are using a freezable workqueue so that balloon operations are
1061          * stopped while the system transitions to/from sleep/hibernation.
1062          */
1063         queue_delayed_work(system_freezable_wq,
1064                            dwork, round_jiffies_relative(HZ));
1065 }
1066
1067 /*
1068  * DEBUGFS Interface
1069  */
1070 #ifdef CONFIG_DEBUG_FS
1071
/*
 * seq_file show callback for the "vmmemctl" debugfs entry: dumps the
 * negotiated capabilities, current/target size and all counters.
 * f->private is the vmballoon set up in vmballoon_debug_open().
 */
static int vmballoon_debug_show(struct seq_file *f, void *offset)
{
	struct vmballoon *b = f->private;
	struct vmballoon_stats *stats = &b->stats;

	/* format capabilities info */
	seq_printf(f,
		   "balloon capabilities:   %#4x\n"
		   "used capabilities:      %#4lx\n"
		   "is resetting:           %c\n",
		   VMW_BALLOON_CAPABILITIES, b->capabilities,
		   b->reset_required ? 'y' : 'n');

	/* format size info */
	seq_printf(f,
		   "target:             %8d pages\n"
		   "current:            %8d pages\n",
		   b->target, b->size);

	/* counters; [true]/[false] index the 2MB / 4KB page-size stats */
	seq_printf(f,
		   "\n"
		   "timer:              %8u\n"
		   "doorbell:           %8u\n"
		   "start:              %8u (%4u failed)\n"
		   "guestType:          %8u (%4u failed)\n"
		   "2m-lock:            %8u (%4u failed)\n"
		   "lock:               %8u (%4u failed)\n"
		   "2m-unlock:          %8u (%4u failed)\n"
		   "unlock:             %8u (%4u failed)\n"
		   "target:             %8u (%4u failed)\n"
		   "prim2mAlloc:        %8u (%4u failed)\n"
		   "primNoSleepAlloc:   %8u (%4u failed)\n"
		   "primCanSleepAlloc:  %8u (%4u failed)\n"
		   "prim2mFree:         %8u\n"
		   "primFree:           %8u\n"
		   "err2mAlloc:         %8u\n"
		   "errAlloc:           %8u\n"
		   "err2mFree:          %8u\n"
		   "errFree:            %8u\n"
		   "doorbellSet:        %8u\n"
		   "doorbellUnset:      %8u\n",
		   stats->timer,
		   stats->doorbell,
		   stats->start, stats->start_fail,
		   stats->guest_type, stats->guest_type_fail,
		   stats->lock[true],  stats->lock_fail[true],
		   stats->lock[false],  stats->lock_fail[false],
		   stats->unlock[true], stats->unlock_fail[true],
		   stats->unlock[false], stats->unlock_fail[false],
		   stats->target, stats->target_fail,
		   stats->alloc[true], stats->alloc_fail[true],
		   stats->alloc[false], stats->alloc_fail[false],
		   stats->sleep_alloc, stats->sleep_alloc_fail,
		   stats->free[true],
		   stats->free[false],
		   stats->refused_alloc[true], stats->refused_alloc[false],
		   stats->refused_free[true], stats->refused_free[false],
		   stats->doorbell_set, stats->doorbell_unset);

	return 0;
}
1133
/* debugfs open: bind the seq_file to the balloon stored in i_private */
static int vmballoon_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, vmballoon_debug_show, inode->i_private);
}
1138
/* file_operations for the read-only "vmmemctl" debugfs entry */
static const struct file_operations vmballoon_debug_fops = {
	.owner		= THIS_MODULE,
	.open		= vmballoon_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
1146
/*
 * Create the "vmmemctl" debugfs entry exposing balloon state and stats.
 * Returns 0 on success, a negative errno otherwise.
 *
 * NOTE(review): in this kernel debugfs_create_file() can also return
 * NULL on allocation failure, which IS_ERR() does not catch; that case
 * is treated as success with a NULL dbg_entry (debugfs_remove(NULL) is
 * a no-op, so it is harmless) — confirm this is intended.
 */
static int __init vmballoon_debugfs_init(struct vmballoon *b)
{
	int error;

	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
					   &vmballoon_debug_fops);
	if (IS_ERR(b->dbg_entry)) {
		error = PTR_ERR(b->dbg_entry);
		pr_err("failed to create debugfs entry, error: %d\n", error);
		return error;
	}

	return 0;
}
1161
/* Remove the "vmmemctl" debugfs entry (safe if creation failed). */
static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
{
	debugfs_remove(b->dbg_entry);
}
1166
1167 #else
1168
/* No-op stub when debugfs support is compiled out. */
static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}
1173
/* No-op stub when debugfs support is compiled out. */
static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}
1177
1178 #endif  /* CONFIG_DEBUG_FS */
1179
/*
 * Module/driver initialization: verify we run under VMware, set up the
 * page lists and delayed work, create the debugfs entry, then kick off
 * the worker which performs the first protocol negotiation (via
 * reset_required = true).
 */
static int __init vmballoon_init(void)
{
	int error;
	unsigned is_2m_pages;
	/*
	 * Check if we are running on VMware's hypervisor and bail out
	 * if we are not.
	 */
	if (x86_hyper_type != X86_HYPER_VMWARE)
		return -ENODEV;

	/* one (pages, refused_pages) list pair per supported page size */
	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
	}

	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);

	error = vmballoon_debugfs_init(&balloon);
	if (error)
		return error;

	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
	balloon.batch_page = NULL;
	balloon.page = NULL;
	/* force the first worker run to negotiate with the host */
	balloon.reset_required = true;

	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);

	return 0;
}
1212
1213 /*
1214  * Using late_initcall() instead of module_init() allows the balloon to use the
1215  * VMCI doorbell even when the balloon is built into the kernel. Otherwise the
1216  * VMCI is probed only after the balloon is initialized. If the balloon is used
1217  * as a module, late_initcall() is equivalent to module_init().
1218  */
1219 late_initcall(vmballoon_init);
1220
/*
 * Module teardown: stop doorbell notifications and the worker before
 * removing debugfs and returning all ballooned pages to the guest.
 */
static void __exit vmballoon_exit(void)
{
	/* no more doorbell callbacks, then wait for any in-flight work */
	vmballoon_vmci_cleanup(&balloon);
	cancel_delayed_work_sync(&balloon.dwork);

	vmballoon_debugfs_exit(&balloon);

	/*
	 * Deallocate all reserved memory, and reset connection with monitor.
	 * Reset connection before deallocating memory to avoid potential for
	 * additional spurious resets from guest touching deallocated pages.
	 */
	vmballoon_send_start(&balloon, 0);
	vmballoon_pop(&balloon);
}
1236 module_exit(vmballoon_exit);