GNU Linux-libre 4.19.286-gnu1
[releases.git] / drivers / misc / vmw_balloon.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * VMware Balloon driver.
4  *
5  * Copyright (C) 2000-2018, VMware, Inc. All Rights Reserved.
6  *
7  * This is VMware physical memory management driver for Linux. The driver
8  * acts like a "balloon" that can be inflated to reclaim physical pages by
9  * reserving them in the guest and invalidating them in the monitor,
10  * freeing up the underlying machine pages so they can be allocated to
11  * other guests.  The balloon can also be deflated to allow the guest to
12  * use more physical memory. Higher level policies can control the sizes
13  * of balloons in VMs in order to manage physical memory resources.
14  */
15
16 //#define DEBUG
17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/mm.h>
22 #include <linux/vmalloc.h>
23 #include <linux/sched.h>
24 #include <linux/module.h>
25 #include <linux/workqueue.h>
26 #include <linux/debugfs.h>
27 #include <linux/seq_file.h>
28 #include <linux/vmw_vmci_defs.h>
29 #include <linux/vmw_vmci_api.h>
30 #include <asm/hypervisor.h>
31
32 MODULE_AUTHOR("VMware, Inc.");
33 MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
34 MODULE_VERSION("1.5.0.0-k");
35 MODULE_ALIAS("dmi:*:svnVMware*:*");
36 MODULE_ALIAS("vmware_vmmemctl");
37 MODULE_LICENSE("GPL");
38
39 /*
40  * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
41  * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
42  * __GFP_NOWARN, to suppress page allocation failure warnings.
43  */
44 #define VMW_PAGE_ALLOC_NOSLEEP          (__GFP_HIGHMEM|__GFP_NOWARN)
45
46 /*
47  * Use GFP_HIGHUSER when executing in a separate kernel thread
48  * context and allocation can sleep.  This is less stressful to
49  * the guest memory system, since it allows the thread to block
50  * while memory is reclaimed, and won't take pages from emergency
51  * low-memory pools.
52  */
53 #define VMW_PAGE_ALLOC_CANSLEEP         (GFP_HIGHUSER)
54
55 /* Maximum number of refused pages we accumulate during inflation cycle */
56 #define VMW_BALLOON_MAX_REFUSED         16
57
58 /*
59  * Hypervisor communication port definitions.
60  */
61 #define VMW_BALLOON_HV_PORT             0x5670
62 #define VMW_BALLOON_HV_MAGIC            0x456c6d6f
63 #define VMW_BALLOON_GUEST_ID            1       /* Linux */
64
/*
 * Optional protocol features negotiated with the host during the START
 * handshake (see VMW_BALLOON_CAPABILITIES and vmballoon_send_start()).
 */
enum vmwballoon_capabilities {
	/*
	 * Bit 0 is reserved and not associated to any capability.
	 */
	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
};
74
75 #define VMW_BALLOON_CAPABILITIES        (VMW_BALLOON_BASIC_CMDS \
76                                         | VMW_BALLOON_BATCHED_CMDS \
77                                         | VMW_BALLOON_BATCHED_2M_CMDS \
78                                         | VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
79
80 #define VMW_BALLOON_2M_SHIFT            (9)
81 #define VMW_BALLOON_NUM_PAGE_SIZES      (2)
82
83 /*
84  * Backdoor commands availability:
85  *
86  * START, GET_TARGET and GUEST_ID are always available,
87  *
88  * VMW_BALLOON_BASIC_CMDS:
89  *      LOCK and UNLOCK commands,
90  * VMW_BALLOON_BATCHED_CMDS:
91  *      BATCHED_LOCK and BATCHED_UNLOCK commands.
 * VMW_BALLOON_BATCHED_2M_CMDS:
 *      BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 *      VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
96  */
97 #define VMW_BALLOON_CMD_START                   0
98 #define VMW_BALLOON_CMD_GET_TARGET              1
99 #define VMW_BALLOON_CMD_LOCK                    2
100 #define VMW_BALLOON_CMD_UNLOCK                  3
101 #define VMW_BALLOON_CMD_GUEST_ID                4
102 #define VMW_BALLOON_CMD_BATCHED_LOCK            6
103 #define VMW_BALLOON_CMD_BATCHED_UNLOCK          7
104 #define VMW_BALLOON_CMD_BATCHED_2M_LOCK         8
105 #define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK       9
106 #define VMW_BALLOON_CMD_VMCI_DOORBELL_SET       10
107
108
109 /* error codes */
110 #define VMW_BALLOON_SUCCESS                     0
111 #define VMW_BALLOON_FAILURE                     -1
112 #define VMW_BALLOON_ERROR_CMD_INVALID           1
113 #define VMW_BALLOON_ERROR_PPN_INVALID           2
114 #define VMW_BALLOON_ERROR_PPN_LOCKED            3
115 #define VMW_BALLOON_ERROR_PPN_UNLOCKED          4
116 #define VMW_BALLOON_ERROR_PPN_PINNED            5
117 #define VMW_BALLOON_ERROR_PPN_NOTNEEDED         6
118 #define VMW_BALLOON_ERROR_RESET                 7
119 #define VMW_BALLOON_ERROR_BUSY                  8
120
121 #define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES   (0x03000000)
122
123 /* Batch page description */
124
125 /*
126  * Layout of a page in the batch page:
127  *
128  * +-------------+----------+--------+
129  * |             |          |        |
130  * | Page number | Reserved | Status |
131  * |             |          |        |
132  * +-------------+----------+--------+
133  * 64  PAGE_SHIFT          6         0
134  *
135  * The reserved field should be set to 0.
136  */
137 #define VMW_BALLOON_BATCH_MAX_PAGES     (PAGE_SIZE / sizeof(u64))
138 #define VMW_BALLOON_BATCH_STATUS_MASK   ((1UL << 5) - 1)
139 #define VMW_BALLOON_BATCH_PAGE_MASK     (~((1UL << PAGE_SHIFT) - 1))
140
/* One guest page holding an array of 64-bit batch entries (see layout above). */
struct vmballoon_batch_page {
	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
};
144
145 static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
146 {
147         return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
148 }
149
150 static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
151                                 int idx)
152 {
153         return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
154 }
155
/*
 * Store physical address @pa in batch entry @idx.
 * NOTE(review): the low bits of the entry carry the status field on
 * return from the host; callers pass page-aligned addresses so those
 * bits start out zero.
 */
static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
				u64 pa)
{
	batch->pages[idx] = pa;
}
161
162
/*
 * Issue a balloon backdoor command to the hypervisor via the "inl" port
 * protocol: magic number in %eax, command in %ecx, port in %edx, and the
 * two arguments in %ebx/%esi.  The hypervisor returns the status in %eax
 * and an auxiliary value in %ebx (stored into @result).  For the START
 * command the capability set comes back in %ecx instead, hence the
 * fix-up below.  The statement expression evaluates to the status code.
 */
#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)             \
({                                                              \
	unsigned long __status, __dummy1, __dummy2, __dummy3;   \
	__asm__ __volatile__ ("inl %%dx" :                      \
		"=a"(__status),                                 \
		"=c"(__dummy1),                                 \
		"=d"(__dummy2),                                 \
		"=b"(result),                                   \
		"=S" (__dummy3) :                               \
		"0"(VMW_BALLOON_HV_MAGIC),                      \
		"1"(VMW_BALLOON_CMD_##cmd),                     \
		"2"(VMW_BALLOON_HV_PORT),                       \
		"3"(arg1),                                      \
		"4" (arg2) :                                    \
		"memory");                                      \
	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)     \
		result = __dummy1;                              \
	result &= -1UL;                                         \
	__status & -1UL;                                        \
})
183
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters exported through debugfs.  Arrays are indexed by page
 * size (0 == 4k, 1 == 2m pages), matching VMW_BALLOON_NUM_PAGE_SIZES.
 */
struct vmballoon_stats {
	unsigned int timer;
	unsigned int doorbell;

	/* allocation statistics */
	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int sleep_alloc;
	unsigned int sleep_alloc_fail;
	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

	/* monitor operations */
	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int target;
	unsigned int target_fail;
	unsigned int start;
	unsigned int start_fail;
	unsigned int guest_type;
	unsigned int guest_type_fail;
	unsigned int doorbell_set;
	unsigned int doorbell_unset;
};

/* STATS_INC compiles away entirely when debugfs support is disabled. */
#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif
217
struct vmballoon;

/*
 * Protocol operations vector.  The legacy (one page at a time) and the
 * batched protocols provide different implementations of page
 * submission and of the lock/unlock hypervisor calls.
 */
struct vmballoon_ops {
	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
	int (*lock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
};
227
/*
 * Per-page-size state: one instance for 4k pages and one for 2m pages
 * (see vmballoon.page_sizes).
 */
struct vmballoon_page_size {
	/* list of reserved physical pages */
	struct list_head pages;

	/* transient list of non-balloonable pages */
	struct list_head refused_pages;
	unsigned int n_refused_pages;
};
236
/*
 * Driver state.  Tracks the current and target balloon size, the
 * per-page-size page lists, and the communication mode (legacy vs.
 * batched) negotiated with the host.
 */
struct vmballoon {
	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
	unsigned supported_page_sizes;

	/* balloon size in pages */
	unsigned int size;
	unsigned int target;

	/* reset flag */
	bool reset_required;

	/* capabilities negotiated in vmballoon_send_start() */
	unsigned long capabilities;

	/* batch page and its capacity, used with the batched protocol */
	struct vmballoon_batch_page *batch_page;
	unsigned int batch_max_pages;
	/* page being submitted with the legacy (non-batched) protocol */
	struct page *page;

	const struct vmballoon_ops *ops;

#ifdef CONFIG_DEBUG_FS
	/* statistics */
	struct vmballoon_stats stats;

	/* debugfs file exporting statistics */
	struct dentry *dbg_entry;
#endif

	struct sysinfo sysinfo;

	struct delayed_work dwork;

	struct vmci_handle vmci_doorbell;
};
272
273 static struct vmballoon balloon;
274
275 /*
276  * Send "start" command to the host, communicating supported version
277  * of the protocol.
278  */
279 static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
280 {
281         unsigned long status, capabilities, dummy = 0;
282         bool success;
283
284         STATS_INC(b->stats.start);
285
286         status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
287
288         switch (status) {
289         case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
290                 b->capabilities = capabilities;
291                 success = true;
292                 break;
293         case VMW_BALLOON_SUCCESS:
294                 b->capabilities = VMW_BALLOON_BASIC_CMDS;
295                 success = true;
296                 break;
297         default:
298                 success = false;
299         }
300
301         /*
302          * 2MB pages are only supported with batching. If batching is for some
303          * reason disabled, do not use 2MB pages, since otherwise the legacy
304          * mechanism is used with 2MB pages, causing a failure.
305          */
306         if ((b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS) &&
307             (b->capabilities & VMW_BALLOON_BATCHED_CMDS))
308                 b->supported_page_sizes = 2;
309         else
310                 b->supported_page_sizes = 1;
311
312         if (!success) {
313                 pr_debug("%s - failed, hv returns %ld\n", __func__, status);
314                 STATS_INC(b->stats.start_fail);
315         }
316         return success;
317 }
318
319 static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
320 {
321         switch (status) {
322         case VMW_BALLOON_SUCCESS:
323                 return true;
324
325         case VMW_BALLOON_ERROR_RESET:
326                 b->reset_required = true;
327                 /* fall through */
328
329         default:
330                 return false;
331         }
332 }
333
334 /*
335  * Communicate guest type to the host so that it can adjust ballooning
336  * algorithm to the one most appropriate for the guest. This command
337  * is normally issued after sending "start" command and is part of
338  * standard reset sequence.
339  */
340 static bool vmballoon_send_guest_id(struct vmballoon *b)
341 {
342         unsigned long status, dummy = 0;
343
344         status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
345                                 dummy);
346
347         STATS_INC(b->stats.guest_type);
348
349         if (vmballoon_check_status(b, status))
350                 return true;
351
352         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
353         STATS_INC(b->stats.guest_type_fail);
354         return false;
355 }
356
357 static u16 vmballoon_page_size(bool is_2m_page)
358 {
359         if (is_2m_page)
360                 return 1 << VMW_BALLOON_2M_SHIFT;
361
362         return 1;
363 }
364
365 /*
366  * Retrieve desired balloon size from the host.
367  */
368 static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
369 {
370         unsigned long status;
371         unsigned long target;
372         unsigned long limit;
373         unsigned long dummy = 0;
374         u32 limit32;
375
376         /*
377          * si_meminfo() is cheap. Moreover, we want to provide dynamic
378          * max balloon size later. So let us call si_meminfo() every
379          * iteration.
380          */
381         si_meminfo(&b->sysinfo);
382         limit = b->sysinfo.totalram;
383
384         /* Ensure limit fits in 32-bits */
385         limit32 = (u32)limit;
386         if (limit != limit32)
387                 return false;
388
389         /* update stats */
390         STATS_INC(b->stats.target);
391
392         status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
393         if (vmballoon_check_status(b, status)) {
394                 *new_target = target;
395                 return true;
396         }
397
398         pr_debug("%s - failed, hv returns %ld\n", __func__, status);
399         STATS_INC(b->stats.target_fail);
400         return false;
401 }
402
403 /*
404  * Notify the host about allocated page so that host can use it without
405  * fear that guest will need it. Host may reject some pages, we need to
406  * check the return value and maybe submit a different page.
407  */
408 static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
409                                 unsigned int *hv_status, unsigned int *target)
410 {
411         unsigned long status, dummy = 0;
412         u32 pfn32;
413
414         pfn32 = (u32)pfn;
415         if (pfn32 != pfn)
416                 return -EINVAL;
417
418         STATS_INC(b->stats.lock[false]);
419
420         *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
421         if (vmballoon_check_status(b, status))
422                 return 0;
423
424         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
425         STATS_INC(b->stats.lock_fail[false]);
426         return -EIO;
427 }
428
429 static int vmballoon_send_batched_lock(struct vmballoon *b,
430                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
431 {
432         unsigned long status;
433         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
434
435         STATS_INC(b->stats.lock[is_2m_pages]);
436
437         if (is_2m_pages)
438                 status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
439                                 *target);
440         else
441                 status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
442                                 *target);
443
444         if (vmballoon_check_status(b, status))
445                 return 0;
446
447         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
448         STATS_INC(b->stats.lock_fail[is_2m_pages]);
449         return 1;
450 }
451
452 /*
453  * Notify the host that guest intends to release given page back into
454  * the pool of available (to the guest) pages.
455  */
456 static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
457                                                         unsigned int *target)
458 {
459         unsigned long status, dummy = 0;
460         u32 pfn32;
461
462         pfn32 = (u32)pfn;
463         if (pfn32 != pfn)
464                 return false;
465
466         STATS_INC(b->stats.unlock[false]);
467
468         status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
469         if (vmballoon_check_status(b, status))
470                 return true;
471
472         pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
473         STATS_INC(b->stats.unlock_fail[false]);
474         return false;
475 }
476
477 static bool vmballoon_send_batched_unlock(struct vmballoon *b,
478                 unsigned int num_pages, bool is_2m_pages, unsigned int *target)
479 {
480         unsigned long status;
481         unsigned long pfn = PHYS_PFN(virt_to_phys(b->batch_page));
482
483         STATS_INC(b->stats.unlock[is_2m_pages]);
484
485         if (is_2m_pages)
486                 status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
487                                 *target);
488         else
489                 status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
490                                 *target);
491
492         if (vmballoon_check_status(b, status))
493                 return true;
494
495         pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
496         STATS_INC(b->stats.unlock_fail[is_2m_pages]);
497         return false;
498 }
499
500 static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
501 {
502         if (is_2m_page)
503                 return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
504
505         return alloc_page(flags);
506 }
507
508 static void vmballoon_free_page(struct page *page, bool is_2m_page)
509 {
510         if (is_2m_page)
511                 __free_pages(page, VMW_BALLOON_2M_SHIFT);
512         else
513                 __free_page(page);
514 }
515
/*
 * Quickly release all pages allocated for the balloon. This function is
 * called when host decides to "reset" balloon for one reason or another.
 * Unlike normal "deflate" we do not (shall not) notify host of the pages
 * being released.
 */
static void vmballoon_pop(struct vmballoon *b)
{
	struct page *page, *next;
	unsigned is_2m_pages;

	/* Walk both page-size lists (4k and 2m) and free everything. */
	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];
		u16 size_per_page = vmballoon_page_size(is_2m_pages);

		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
			list_del(&page->lru);
			vmballoon_free_page(page, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);
			b->size -= size_per_page;
			/* yield the CPU periodically; the lists can be long */
			cond_resched();
		}
	}

	/* Clearing the batch_page unconditionally has no adverse effect */
	free_page((unsigned long)b->batch_page);
	b->batch_page = NULL;
}
546
/*
 * Notify the host of a ballooned page. If host rejects the page put it on the
 * refuse list, those refused page are then released at the end of the
 * inflation cycle.
 *
 * Returns 0 on success, -EIO when the page was freed due to a fatal
 * hypervisor error, or the lock error code when the page was refused.
 */
static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
				bool is_2m_pages, unsigned int *target)
{
	int locked, hv_status;
	struct page *page = b->page;
	struct vmballoon_page_size *page_size = &b->page_sizes[false];

	/* is_2m_pages can never happen as 2m pages support implies batching */

	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
								target);
	if (locked) {
		STATS_INC(b->stats.refused_alloc[false]);

		/* On reset or "page not needed", give the page back now. */
		if (locked == -EIO &&
		    (hv_status == VMW_BALLOON_ERROR_RESET ||
		     hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED)) {
			vmballoon_free_page(page, false);
			return -EIO;
		}

		/*
		 * Place page on the list of non-balloonable pages
		 * and retry allocation, unless we already accumulated
		 * too many of them, in which case take a breather.
		 */
		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
			page_size->n_refused_pages++;
			list_add(&page->lru, &page_size->refused_pages);
		} else {
			vmballoon_free_page(page, false);
		}
		return locked;
	}

	/* track allocated page */
	list_add(&page->lru, &page_size->pages);

	/* update balloon size */
	b->size++;

	return 0;
}
595
/*
 * Lock a batch of pages.  If the whole batch fails every page in it is
 * freed; otherwise each page's individual status decides whether it is
 * kept in the balloon, parked on the refused list, or freed.
 */
static int vmballoon_lock_batched_page(struct vmballoon *b,
		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
{
	int locked, i;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
			target);
	if (locked > 0) {
		/* Whole-batch failure: free every page in the batch. */
		for (i = 0; i < num_pages; i++) {
			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);

			vmballoon_free_page(p, is_2m_pages);
		}

		return -EIO;
	}

	/* Process the per-page status codes returned by the hypervisor. */
	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);

		switch (locked) {
		case VMW_BALLOON_SUCCESS:
			/* page is ballooned: track it and grow the balloon */
			list_add(&p->lru, &page_size->pages);
			b->size += size_per_page;
			break;
		case VMW_BALLOON_ERROR_PPN_PINNED:
		case VMW_BALLOON_ERROR_PPN_INVALID:
			/* transient refusal: defer, unless too many piled up */
			if (page_size->n_refused_pages
					< VMW_BALLOON_MAX_REFUSED) {
				list_add(&p->lru, &page_size->refused_pages);
				page_size->n_refused_pages++;
				break;
			}
			/* Fallthrough */
		case VMW_BALLOON_ERROR_RESET:
		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
			vmballoon_free_page(p, is_2m_pages);
			break;
		default:
			/* This should never happen */
			WARN_ON_ONCE(true);
		}
	}

	return 0;
}
649
650 /*
651  * Release the page allocated for the balloon. Note that we first notify
652  * the host so it can make sure the page will be available for the guest
653  * to use, if needed.
654  */
655 static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
656                 bool is_2m_pages, unsigned int *target)
657 {
658         struct page *page = b->page;
659         struct vmballoon_page_size *page_size = &b->page_sizes[false];
660
661         /* is_2m_pages can never happen as 2m pages support implies batching */
662
663         if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
664                 list_add(&page->lru, &page_size->pages);
665                 return -EIO;
666         }
667
668         /* deallocate page */
669         vmballoon_free_page(page, false);
670         STATS_INC(b->stats.free[false]);
671
672         /* update balloon size */
673         b->size--;
674
675         return 0;
676 }
677
/*
 * Unlock a batch of pages: ask the host to release them, then free the
 * pages the host confirmed and re-own the ones it did not.  Returns
 * -EIO when the batch command itself failed, 0 otherwise.
 */
static int vmballoon_unlock_batched_page(struct vmballoon *b,
				unsigned int num_pages, bool is_2m_pages,
				unsigned int *target)
{
	int locked, i, ret = 0;
	bool hv_success;
	u16 size_per_page = vmballoon_page_size(is_2m_pages);

	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
			target);
	if (!hv_success)
		ret = -EIO;

	for (i = 0; i < num_pages; i++) {
		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
		struct vmballoon_page_size *page_size =
				&b->page_sizes[is_2m_pages];

		locked = vmballoon_batch_get_status(b->batch_page, i);
		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
			/*
			 * That page wasn't successfully unlocked by the
			 * hypervisor, re-add it to the list of pages owned by
			 * the balloon driver.
			 */
			list_add(&p->lru, &page_size->pages);
		} else {
			/* deallocate page */
			vmballoon_free_page(p, is_2m_pages);
			STATS_INC(b->stats.free[is_2m_pages]);

			/* update balloon size */
			b->size -= size_per_page;
		}
	}

	return ret;
}
717
718 /*
719  * Release pages that were allocated while attempting to inflate the
720  * balloon but were refused by the host for one reason or another.
721  */
722 static void vmballoon_release_refused_pages(struct vmballoon *b,
723                 bool is_2m_pages)
724 {
725         struct page *page, *next;
726         struct vmballoon_page_size *page_size =
727                         &b->page_sizes[is_2m_pages];
728
729         list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
730                 list_del(&page->lru);
731                 vmballoon_free_page(page, is_2m_pages);
732                 STATS_INC(b->stats.refused_free[is_2m_pages]);
733         }
734
735         page_size->n_refused_pages = 0;
736 }
737
/* Legacy protocol: remember the single page about to be submitted. */
static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
{
	b->page = p;
}
742
/* Batched protocol: record the page's physical address in batch slot @idx. */
static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
				struct page *p)
{
	vmballoon_batch_set_pa(b->batch_page, idx,
			(u64)page_to_pfn(p) << PAGE_SHIFT);
}
749
/*
 * Inflate the balloon towards its target size. Note that we try to limit
 * the rate of allocation to make sure we are not choking the rest of the
 * system.
 */
static void vmballoon_inflate(struct vmballoon *b)
{
	unsigned int num_pages = 0;	/* pages accumulated but not yet locked */
	int error = 0;
	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
	bool is_2m_pages;

	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);

	/*
	 * First try NOSLEEP page allocations to inflate balloon.
	 *
	 * If we do not throttle nosleep allocations, we can drain all
	 * free pages in the guest quickly (if the balloon target is high).
	 * As a side-effect, draining free pages helps to inform (force)
	 * the guest to start swapping if balloon target is not met yet,
	 * which is a desired behavior. However, balloon driver can consume
	 * all available CPU cycles if too many pages are allocated in a
	 * second. Therefore, we throttle nosleep allocations even when
	 * the guest is not under memory pressure. OTOH, if we have already
	 * predicted that the guest is under memory pressure, then we
	 * slowdown page allocations considerably.
	 */

	/*
	 * Start with no sleep allocation rate which may be higher
	 * than sleeping allocation rate.
	 */
	is_2m_pages = b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;

	pr_debug("%s - goal: %d",  __func__, b->target - b->size);

	while (!b->reset_required &&
		b->size + num_pages * vmballoon_page_size(is_2m_pages)
		< b->target) {
		struct page *page;

		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
			STATS_INC(b->stats.alloc[is_2m_pages]);
		else
			STATS_INC(b->stats.sleep_alloc);

		page = vmballoon_alloc_page(flags, is_2m_pages);
		if (!page) {
			STATS_INC(b->stats.alloc_fail[is_2m_pages]);

			/* 2m allocation failed: flush the batch, fall back to 4k */
			if (is_2m_pages) {
				b->ops->lock(b, num_pages, true, &b->target);

				/*
				 * ignore errors from locking as we now switch
				 * to 4k pages and we might get different
				 * errors.
				 */

				num_pages = 0;
				is_2m_pages = false;
				continue;
			}

			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
				/*
				 * CANSLEEP page allocation failed, so guest
				 * is under severe memory pressure. We just log
				 * the event, but do not stop the inflation
				 * due to its negative impact on performance.
				 */
				STATS_INC(b->stats.sleep_alloc_fail);
				break;
			}

			/*
			 * NOSLEEP page allocation failed, so the guest is
			 * under memory pressure. Slowing down page allocations
			 * seems to be reasonable, but doing so might actually
			 * cause the hypervisor to throttle us down, resulting
			 * in degraded performance. We will count on the
			 * scheduler and standard memory management mechanisms
			 * for now.
			 */
			flags = VMW_PAGE_ALLOC_CANSLEEP;
			continue;
		}

		b->ops->add_page(b, num_pages++, page);
		if (num_pages == b->batch_max_pages) {
			error = b->ops->lock(b, num_pages, is_2m_pages,
					&b->target);
			num_pages = 0;
			if (error)
				break;
		}

		cond_resched();
	}

	/* flush any partially-filled batch left over from the loop */
	if (num_pages > 0)
		b->ops->lock(b, num_pages, is_2m_pages, &b->target);

	/* free pages the host refused during this cycle */
	vmballoon_release_refused_pages(b, true);
	vmballoon_release_refused_pages(b, false);
}
857
858 /*
859  * Decrease the size of the balloon allowing guest to use more memory.
860  */
861 static void vmballoon_deflate(struct vmballoon *b)
862 {
863         unsigned is_2m_pages;
864
865         pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
866
867         /* free pages to reach target */
868         for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
869                         is_2m_pages++) {
870                 struct page *page, *next;
871                 unsigned int num_pages = 0;
872                 struct vmballoon_page_size *page_size =
873                                 &b->page_sizes[is_2m_pages];
874
875                 list_for_each_entry_safe(page, next, &page_size->pages, lru) {
876                         if (b->reset_required ||
877                                 (b->target > 0 &&
878                                         b->size - num_pages
879                                         * vmballoon_page_size(is_2m_pages)
880                                 < b->target + vmballoon_page_size(true)))
881                                 break;
882
883                         list_del(&page->lru);
884                         b->ops->add_page(b, num_pages++, page);
885
886                         if (num_pages == b->batch_max_pages) {
887                                 int error;
888
889                                 error = b->ops->unlock(b, num_pages,
890                                                 is_2m_pages, &b->target);
891                                 num_pages = 0;
892                                 if (error)
893                                         return;
894                         }
895
896                         cond_resched();
897                 }
898
899                 if (num_pages > 0)
900                         b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
901         }
902 }
903
/* Non-batched protocol: one page is passed to the host per command. */
static const struct vmballoon_ops vmballoon_basic_ops = {
	.add_page = vmballoon_add_page,
	.lock = vmballoon_lock_page,
	.unlock = vmballoon_unlock_page
};
909
/* Batched protocol: up to batch_max_pages are staged per lock/unlock. */
static const struct vmballoon_ops vmballoon_batched_ops = {
	.add_page = vmballoon_add_batched_page,
	.lock = vmballoon_lock_batched_page,
	.unlock = vmballoon_unlock_batched_page
};
915
916 static bool vmballoon_init_batching(struct vmballoon *b)
917 {
918         struct page *page;
919
920         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
921         if (!page)
922                 return false;
923
924         b->batch_page = page_address(page);
925         return true;
926 }
927
928 /*
929  * Receive notification and resize balloon
930  */
931 static void vmballoon_doorbell(void *client_data)
932 {
933         struct vmballoon *b = client_data;
934
935         STATS_INC(b->stats.doorbell);
936
937         mod_delayed_work(system_freezable_wq, &b->dwork, 0);
938 }
939
940 /*
941  * Clean up vmci doorbell
942  */
943 static void vmballoon_vmci_cleanup(struct vmballoon *b)
944 {
945         int error;
946
947         VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
948                         VMCI_INVALID_ID, error);
949         STATS_INC(b->stats.doorbell_unset);
950
951         if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
952                 vmci_doorbell_destroy(b->vmci_doorbell);
953                 b->vmci_doorbell = VMCI_INVALID_HANDLE;
954         }
955 }
956
957 /*
958  * Initialize vmci doorbell, to get notified as soon as balloon changes
959  */
960 static int vmballoon_vmci_init(struct vmballoon *b)
961 {
962         unsigned long error, dummy;
963
964         if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) == 0)
965                 return 0;
966
967         error = vmci_doorbell_create(&b->vmci_doorbell, VMCI_FLAG_DELAYED_CB,
968                                      VMCI_PRIVILEGE_FLAG_RESTRICTED,
969                                      vmballoon_doorbell, b);
970
971         if (error != VMCI_SUCCESS)
972                 goto fail;
973
974         error = VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, b->vmci_doorbell.context,
975                                    b->vmci_doorbell.resource, dummy);
976
977         STATS_INC(b->stats.doorbell_set);
978
979         if (error != VMW_BALLOON_SUCCESS)
980                 goto fail;
981
982         return 0;
983 fail:
984         vmballoon_vmci_cleanup(b);
985         return -EIO;
986 }
987
988 /*
989  * Perform standard reset sequence by popping the balloon (in case it
990  * is not  empty) and then restarting protocol. This operation normally
991  * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
992  */
993 static void vmballoon_reset(struct vmballoon *b)
994 {
995         int error;
996
997         vmballoon_vmci_cleanup(b);
998
999         /* free all pages, skipping monitor unlock */
1000         vmballoon_pop(b);
1001
1002         if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
1003                 return;
1004
1005         if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
1006                 b->ops = &vmballoon_batched_ops;
1007                 b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
1008                 if (!vmballoon_init_batching(b)) {
1009                         /*
1010                          * We failed to initialize batching, inform the monitor
1011                          * about it by sending a null capability.
1012                          *
1013                          * The guest will retry in one second.
1014                          */
1015                         vmballoon_send_start(b, 0);
1016                         return;
1017                 }
1018         } else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
1019                 b->ops = &vmballoon_basic_ops;
1020                 b->batch_max_pages = 1;
1021         }
1022
1023         b->reset_required = false;
1024
1025         error = vmballoon_vmci_init(b);
1026         if (error)
1027                 pr_err("failed to initialize vmci doorbell\n");
1028
1029         if (!vmballoon_send_guest_id(b))
1030                 pr_err("failed to send guest ID to the host\n");
1031 }
1032
1033 /*
1034  * Balloon work function: reset protocol, if needed, get the new size and
1035  * adjust balloon as needed. Repeat in 1 sec.
1036  */
1037 static void vmballoon_work(struct work_struct *work)
1038 {
1039         struct delayed_work *dwork = to_delayed_work(work);
1040         struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
1041         unsigned int target;
1042
1043         STATS_INC(b->stats.timer);
1044
1045         if (b->reset_required)
1046                 vmballoon_reset(b);
1047
1048         if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
1049                 /* update target, adjust size */
1050                 b->target = target;
1051
1052                 if (b->size < target)
1053                         vmballoon_inflate(b);
1054                 else if (target == 0 ||
1055                                 b->size > target + vmballoon_page_size(true))
1056                         vmballoon_deflate(b);
1057         }
1058
1059         /*
1060          * We are using a freezable workqueue so that balloon operations are
1061          * stopped while the system transitions to/from sleep/hibernation.
1062          */
1063         queue_delayed_work(system_freezable_wq,
1064                            dwork, round_jiffies_relative(HZ));
1065 }
1066
1067 /*
1068  * DEBUGFS Interface
1069  */
1070 #ifdef CONFIG_DEBUG_FS
1071
/*
 * seq_file show callback for the "vmmemctl" debugfs entry: dumps the
 * negotiated capabilities, current/target size and all counters.
 * f->private is the vmballoon set up in vmballoon_debug_open().
 */
static int vmballoon_debug_show(struct seq_file *f, void *offset)
{
	struct vmballoon *b = f->private;
	struct vmballoon_stats *stats = &b->stats;

	/* format capabilities info */
	seq_printf(f,
		   "balloon capabilities:   %#4x\n"
		   "used capabilities:      %#4lx\n"
		   "is resetting:           %c\n",
		   VMW_BALLOON_CAPABILITIES, b->capabilities,
		   b->reset_required ? 'y' : 'n');

	/* format size info */
	seq_printf(f,
		   "target:             %8d pages\n"
		   "current:            %8d pages\n",
		   b->target, b->size);

	/* counters; [true]/[false] index the 2MB / 4KB page-size stats */
	seq_printf(f,
		   "\n"
		   "timer:              %8u\n"
		   "doorbell:           %8u\n"
		   "start:              %8u (%4u failed)\n"
		   "guestType:          %8u (%4u failed)\n"
		   "2m-lock:            %8u (%4u failed)\n"
		   "lock:               %8u (%4u failed)\n"
		   "2m-unlock:          %8u (%4u failed)\n"
		   "unlock:             %8u (%4u failed)\n"
		   "target:             %8u (%4u failed)\n"
		   "prim2mAlloc:        %8u (%4u failed)\n"
		   "primNoSleepAlloc:   %8u (%4u failed)\n"
		   "primCanSleepAlloc:  %8u (%4u failed)\n"
		   "prim2mFree:         %8u\n"
		   "primFree:           %8u\n"
		   "err2mAlloc:         %8u\n"
		   "errAlloc:           %8u\n"
		   "err2mFree:          %8u\n"
		   "errFree:            %8u\n"
		   "doorbellSet:        %8u\n"
		   "doorbellUnset:      %8u\n",
		   stats->timer,
		   stats->doorbell,
		   stats->start, stats->start_fail,
		   stats->guest_type, stats->guest_type_fail,
		   stats->lock[true],  stats->lock_fail[true],
		   stats->lock[false],  stats->lock_fail[false],
		   stats->unlock[true], stats->unlock_fail[true],
		   stats->unlock[false], stats->unlock_fail[false],
		   stats->target, stats->target_fail,
		   stats->alloc[true], stats->alloc_fail[true],
		   stats->alloc[false], stats->alloc_fail[false],
		   stats->sleep_alloc, stats->sleep_alloc_fail,
		   stats->free[true],
		   stats->free[false],
		   stats->refused_alloc[true], stats->refused_alloc[false],
		   stats->refused_free[true], stats->refused_free[false],
		   stats->doorbell_set, stats->doorbell_unset);

	return 0;
}
1133
/* debugfs open: bind the seq_file to the balloon stored in i_private */
static int vmballoon_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, vmballoon_debug_show, inode->i_private);
}
1138
/* file_operations for the read-only "vmmemctl" debugfs entry */
static const struct file_operations vmballoon_debug_fops = {
	.owner		= THIS_MODULE,
	.open		= vmballoon_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
1146
/*
 * Create the "vmmemctl" debugfs entry exposing balloon state and stats.
 * Returns 0 on success, a negative errno otherwise.
 *
 * NOTE(review): in this kernel debugfs_create_file() can also return
 * NULL on allocation failure, which IS_ERR() does not catch; that case
 * is treated as success with a NULL dbg_entry (debugfs_remove(NULL) is
 * a no-op, so it is harmless) — confirm this is intended.
 */
static int __init vmballoon_debugfs_init(struct vmballoon *b)
{
	int error;

	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
					   &vmballoon_debug_fops);
	if (IS_ERR(b->dbg_entry)) {
		error = PTR_ERR(b->dbg_entry);
		pr_err("failed to create debugfs entry, error: %d\n", error);
		return error;
	}

	return 0;
}
1161
/* Remove the "vmmemctl" debugfs entry (safe if creation failed). */
static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
{
	debugfs_remove(b->dbg_entry);
}
1166
1167 #else
1168
/* No-op stub when debugfs support is compiled out. */
static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}
1173
/* No-op stub when debugfs support is compiled out. */
static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}
1177
1178 #endif  /* CONFIG_DEBUG_FS */
1179
/*
 * Module/driver initialization: verify we run under VMware, set up the
 * page lists and delayed work, create the debugfs entry, then kick off
 * the worker which performs the first protocol negotiation (via
 * reset_required = true).
 */
static int __init vmballoon_init(void)
{
	int error;
	unsigned is_2m_pages;
	/*
	 * Check if we are running on VMware's hypervisor and bail out
	 * if we are not.
	 */
	if (x86_hyper_type != X86_HYPER_VMWARE)
		return -ENODEV;

	/* one (pages, refused_pages) list pair per supported page size */
	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
			is_2m_pages++) {
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
	}

	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);

	error = vmballoon_debugfs_init(&balloon);
	if (error)
		return error;

	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
	balloon.batch_page = NULL;
	balloon.page = NULL;
	/* force the first worker run to negotiate with the host */
	balloon.reset_required = true;

	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);

	return 0;
}
1212
1213 /*
1214  * Using late_initcall() instead of module_init() allows the balloon to use the
1215  * VMCI doorbell even when the balloon is built into the kernel. Otherwise the
1216  * VMCI is probed only after the balloon is initialized. If the balloon is used
1217  * as a module, late_initcall() is equivalent to module_init().
1218  */
1219 late_initcall(vmballoon_init);
1220
/*
 * Module teardown: stop doorbell notifications and the worker before
 * removing debugfs and returning all ballooned pages to the guest.
 */
static void __exit vmballoon_exit(void)
{
	/* no more doorbell callbacks, then wait for any in-flight work */
	vmballoon_vmci_cleanup(&balloon);
	cancel_delayed_work_sync(&balloon.dwork);

	vmballoon_debugfs_exit(&balloon);

	/*
	 * Deallocate all reserved memory, and reset connection with monitor.
	 * Reset connection before deallocating memory to avoid potential for
	 * additional spurious resets from guest touching deallocated pages.
	 */
	vmballoon_send_start(&balloon, 0);
	vmballoon_pop(&balloon);
}
1236 module_exit(vmballoon_exit);