arch/x86/kernel/cpu/intel_rdt_rdtgroup.c

   1 /*
   2  * User interface for Resource Allocation in Resource Director Technology(RDT)
   3  *
   4  * Copyright (C) 2016 Intel Corporation
   5  *
   6  * Author: Fenghua Yu <fenghua.yu@intel.com>
   7  *
   8  * This program is free software; you can redistribute it and/or modify it
   9  * under the terms and conditions of the GNU General Public License,
  10  * version 2, as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  15  * more details.
  16  *
  17  * More information about RDT can be found in the Intel (R) x86 Architecture
  18  * Software Developer Manual.
  19  */
  20
  21 #define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt
  22
  23 #include <linux/cacheinfo.h>
  24 #include <linux/cpu.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/fs.h>
  27 #include <linux/sysfs.h>
  28 #include <linux/kernfs.h>
  29 #include <linux/seq_buf.h>
  30 #include <linux/seq_file.h>
  31 #include <linux/sched/signal.h>
  32 #include <linux/sched/task.h>
  33 #include <linux/slab.h>
  34 #include <linux/task_work.h>
  35
  36 #include <uapi/linux/magic.h>
  37
  38 #include <asm/intel_rdt_sched.h>
  39 #include "intel_rdt.h"
  40
     /*
      * Static keys, all defined in the "false" state.
      * NOTE(review): judging by the names they gate RDT as a whole, monitoring
      * and allocation respectively; they are flipped outside this section.
      */
  41 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
  42 DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  43 DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
     /* kernfs root object -- NOTE(review): created outside this section */
  44 static struct kernfs_root *rdt_root;
     /* The default resource group; CLOSID 0 is reserved for it (see closid_init()) */
  45 struct rdtgroup rdtgroup_default;
     /* List of resource groups, linked through rdtgroup::rdtgroup_list */
  46 LIST_HEAD(rdt_all_groups);
  47
  48 /* Kernel fs node for "info" directory under root */
  49 static struct kernfs_node *kn_info;
  50
  51 /* Kernel fs node for "mon_groups" directory under root */
  52 static struct kernfs_node *kn_mongrp;
  53
  54 /* Kernel fs node for "mon_data" directory under root */
  55 static struct kernfs_node *kn_mondata;
  56
     /*
      * Status text of the last command: written through rdt_last_cmd_*() and
      * read back by rdt_last_cmd_status_show(), both under rdtgroup_mutex.
      * NOTE(review): last_cmd_status is presumably initialised elsewhere to use
      * last_cmd_status_buf as its backing storage -- confirm.
      */
  57 static struct seq_buf last_cmd_status;
  58 static char last_cmd_status_buf[512];
  59
     /* debugfs dentry -- NOTE(review): not used in this section, presumably the resctrl debugfs directory */
  60 struct dentry *debugfs_resctrl;
  61
     /*
      * Clear the last-command status buffer.
      * The caller must hold rdtgroup_mutex (asserted via lockdep).
      */
  62 void rdt_last_cmd_clear(void)
  63 {
  64         lockdep_assert_held(&rdtgroup_mutex);
  65         seq_buf_clear(&last_cmd_status);
  66 }
  67
     /*
      * Append the string @s to the last-command status buffer.
      * The caller must hold rdtgroup_mutex (asserted via lockdep).
      */
  68 void rdt_last_cmd_puts(const char *s)
  69 {
  70         lockdep_assert_held(&rdtgroup_mutex);
  71         seq_buf_puts(&last_cmd_status, s);
  72 }
  73
  74 void rdt_last_cmd_printf(const char *fmt, ...)
  75 {
  76         va_list ap;
  77
  78         va_start(ap, fmt);
  79         lockdep_assert_held(&rdtgroup_mutex);
  80         seq_buf_vprintf(&last_cmd_status, fmt, ap);
  81         va_end(ap);
  82 }
  83
  84 /*
  85  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
  86  * we can keep a bitmap of free CLOSIDs in a single integer.
  87  *
  88  * Using a global CLOSID across all resources has some advantages and
  89  * some drawbacks:
  90  * + We can simply set "current->closid" to assign a task to a resource
  91  *   group.
  92  * + Context switch code can avoid extra memory references deciding which
  93  *   CLOSID to load into the PQR_ASSOC MSR
  94  * - We give up some options in configuring resource groups across multi-socket
  95  *   systems.
  96  * - Our choices on how to configure each resource become progressively more
  97  *   limited as the number of resources grows.
  98  */
     /* Bitmap of free CLOSIDs: bit N set means CLOSID N is available */
  99 static int closid_free_map;
     /* Number of CLOSIDs covered by closid_free_map; set by closid_init() */
 100 static int closid_free_map_len;
 101
     /* Return the number of CLOSIDs recorded by closid_init() */
 102 int closids_supported(void)
 103 {
 104         return closid_free_map_len;
 105 }
 106
 107 static void closid_init(void)
 108 {
 109         struct rdt_resource *r;
 110         int rdt_min_closid = 32;
 111
 112         /* Compute rdt_min_closid across all resources */
 113         for_each_alloc_enabled_rdt_resource(r)
 114                 rdt_min_closid = min(rdt_min_closid, r->num_closid);
 115
 116         closid_free_map = BIT_MASK(rdt_min_closid) - 1;
 117
 118         /* CLOSID 0 is always reserved for the default group */
 119         closid_free_map &= ~1;
 120         closid_free_map_len = rdt_min_closid;
 121 }
 122
 123 static int closid_alloc(void)
 124 {
 125         u32 closid = ffs(closid_free_map);
 126
 127         if (closid == 0)
 128                 return -ENOSPC;
 129         closid--;
 130         closid_free_map &= ~(1 << closid);
 131
 132         return closid;
 133 }
 134
 135 void closid_free(int closid)
 136 {
 137         closid_free_map |= 1 << closid;
 138 }
 139
 140 /**
 141  * closid_allocated - test if provided closid is in use
 142  * @closid: closid to be tested
 143  *
 144  * Return: true if @closid is currently associated with a resource group,
 145  * false if @closid is free
 146  */
 147 static bool closid_allocated(unsigned int closid)
 148 {
 149         return (closid_free_map & (1 << closid)) == 0;
 150 }
 151
 152 /**
 153  * rdtgroup_mode_by_closid - Return mode of resource group with closid
 154  * @closid: closid if the resource group
 155  *
 156  * Each resource group is associated with a @closid. Here the mode
 157  * of a resource group can be queried by searching for it using its closid.
 158  *
 159  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 160  */
 161 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 162 {
 163         struct rdtgroup *rdtgrp;
 164
 165         list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 166                 if (rdtgrp->closid == closid)
 167                         return rdtgrp->mode;
 168         }
 169
 170         return RDT_NUM_MODES;
 171 }
 172
     /*
      * Names of the resource group modes, indexed by enum rdtgrp_mode.
      * Looked up through rdtgroup_mode_str().
      */
 173 static const char * const rdt_mode_str[] = {
 174         [RDT_MODE_SHAREABLE]            = "shareable",
 175         [RDT_MODE_EXCLUSIVE]            = "exclusive",
 176         [RDT_MODE_PSEUDO_LOCKSETUP]     = "pseudo-locksetup",
 177         [RDT_MODE_PSEUDO_LOCKED]        = "pseudo-locked",
 178 };
 179
 180 /**
 181  * rdtgroup_mode_str - Return the string representation of mode
 182  * @mode: the resource group mode as &enum rdtgroup_mode
 183  *
 184  * Return: string representation of valid mode, "unknown" otherwise
 185  */
 186 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 187 {
 188         if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 189                 return "unknown";
 190
 191         return rdt_mode_str[mode];
 192 }
 193
 194 /* set uid and gid of rdtgroup dirs and files to that of the creator */
 195 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 196 {
 197         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 198                                 .ia_uid = current_fsuid(),
 199                                 .ia_gid = current_fsgid(), };
 200
 201         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 202             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 203                 return 0;
 204
 205         return kernfs_setattr(kn, &iattr);
 206 }
 207
 208 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 209 {
 210         struct kernfs_node *kn;
 211         int ret;
 212
 213         kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 214                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 215                                   0, rft->kf_ops, rft, NULL, NULL);
 216         if (IS_ERR(kn))
 217                 return PTR_ERR(kn);
 218
 219         ret = rdtgroup_kn_set_ugid(kn);
 220         if (ret) {
 221                 kernfs_remove(kn);
 222                 return ret;
 223         }
 224
 225         return 0;
 226 }
 227
 228 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 229 {
 230         struct kernfs_open_file *of = m->private;
 231         struct rftype *rft = of->kn->priv;
 232
 233         if (rft->seq_show)
 234                 return rft->seq_show(of, m, arg);
 235         return 0;
 236 }
 237
 238 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 239                                    size_t nbytes, loff_t off)
 240 {
 241         struct rftype *rft = of->kn->priv;
 242
 243         if (rft->write)
 244                 return rft->write(of, buf, nbytes, off);
 245
 246         return -EINVAL;
 247 }
 248
     /*
      * kernfs operations that dispatch reads and writes to the per-file
      * rftype callbacks via rdtgroup_seqfile_show() and rdtgroup_file_write().
      */
 249 static struct kernfs_ops rdtgroup_kf_single_ops = {
 250         .atomic_write_len       = PAGE_SIZE,
 251         .write                  = rdtgroup_file_write,
 252         .seq_show               = rdtgroup_seqfile_show,
 253 };
 254
     /*
      * kernfs operations whose seq_show is rdtgroup_mondata_show(); no write
      * handler is set here.
      */
 255 static struct kernfs_ops kf_mondata_ops = {
 256         .atomic_write_len       = PAGE_SIZE,
 257         .seq_show               = rdtgroup_mondata_show,
 258 };
 259
 260 static bool is_cpu_list(struct kernfs_open_file *of)
 261 {
 262         struct rftype *rft = of->kn->priv;
 263
 264         return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 265 }
 266
 267 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 268                               struct seq_file *s, void *v)
 269 {
 270         struct rdtgroup *rdtgrp;
 271         struct cpumask *mask;
 272         int ret = 0;
 273
 274         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 275
 276         if (rdtgrp) {
 277                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 278                         if (!rdtgrp->plr->d) {
 279                                 rdt_last_cmd_clear();
 280                                 rdt_last_cmd_puts("Cache domain offline\n");
 281                                 ret = -ENODEV;
 282                         } else {
 283                                 mask = &rdtgrp->plr->d->cpu_mask;
 284                                 seq_printf(s, is_cpu_list(of) ?
 285                                            "%*pbl\n" : "%*pb\n",
 286                                            cpumask_pr_args(mask));
 287                         }
 288                 } else {
 289                         seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 290                                    cpumask_pr_args(&rdtgrp->cpu_mask));
 291                 }
 292         } else {
 293                 ret = -ENOENT;
 294         }
 295         rdtgroup_kn_unlock(of->kn);
 296
 297         return ret;
 298 }
 299
 300 /*
 301  * This is safe against intel_rdt_sched_in() called from __switch_to()
 302  * because __switch_to() is executed with interrupts disabled. A local call
 303  * from update_closid_rmid() is proteced against __switch_to() because
 304  * preemption is disabled.
 305  */
 306 static void update_cpu_closid_rmid(void *info)
 307 {
 308         struct rdtgroup *r = info;
 309
 310         if (r) {
 311                 this_cpu_write(pqr_state.default_closid, r->closid);
 312                 this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
 313         }
 314
 315         /*
 316          * We cannot unconditionally write the MSR because the current
 317          * executing task might have its own closid selected. Just reuse
 318          * the context switch code.
 319          */
 320         intel_rdt_sched_in();
 321 }
 322
 323 /*
 324  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
 325  *
 326  * Per task closids/rmids must have been set up before calling this function.
 327  */
 328 static void
 329 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 330 {
 331         int cpu = get_cpu();
 332
 333         if (cpumask_test_cpu(cpu, cpu_mask))
 334                 update_cpu_closid_rmid(r);
 335         smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
 336         put_cpu();
 337 }
 338
 339 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 340                           cpumask_var_t tmpmask)
 341 {
 342         struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 343         struct list_head *head;
 344
 345         /* Check whether cpus belong to parent ctrl group */
 346         cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 347         if (cpumask_weight(tmpmask)) {
 348                 rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
 349                 return -EINVAL;
 350         }
 351
 352         /* Check whether cpus are dropped from this group */
 353         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 354         if (cpumask_weight(tmpmask)) {
 355                 /* Give any dropped cpus to parent rdtgroup */
 356                 cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 357                 update_closid_rmid(tmpmask, prgrp);
 358         }
 359
 360         /*
 361          * If we added cpus, remove them from previous group that owned them
 362          * and update per-cpu rmid
 363          */
 364         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 365         if (cpumask_weight(tmpmask)) {
 366                 head = &prgrp->mon.crdtgrp_list;
 367                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 368                         if (crgrp == rdtgrp)
 369                                 continue;
 370                         cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 371                                        tmpmask);
 372                 }
 373                 update_closid_rmid(tmpmask, rdtgrp);
 374         }
 375
 376         /* Done pushing/pulling - update this group with new mask */
 377         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 378
 379         return 0;
 380 }
 381
 382 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 383 {
 384         struct rdtgroup *crgrp;
 385
 386         cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
 387         /* update the child mon group masks as well*/
 388         list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 389                 cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 390 }
 391
 392 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 393                            cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 394 {
 395         struct rdtgroup *r, *crgrp;
 396         struct list_head *head;
 397
 398         /* Check whether cpus are dropped from this group */
 399         cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 400         if (cpumask_weight(tmpmask)) {
 401                 /* Can't drop from default group */
 402                 if (rdtgrp == &rdtgroup_default) {
 403                         rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 404                         return -EINVAL;
 405                 }
 406
 407                 /* Give any dropped cpus to rdtgroup_default */
 408                 cpumask_or(&rdtgroup_default.cpu_mask,
 409                            &rdtgroup_default.cpu_mask, tmpmask);
 410                 update_closid_rmid(tmpmask, &rdtgroup_default);
 411         }
 412
 413         /*
 414          * If we added cpus, remove them from previous group and
 415          * the prev group's child groups that owned them
 416          * and update per-cpu closid/rmid.
 417          */
 418         cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 419         if (cpumask_weight(tmpmask)) {
 420                 list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 421                         if (r == rdtgrp)
 422                                 continue;
 423                         cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 424                         if (cpumask_weight(tmpmask1))
 425                                 cpumask_rdtgrp_clear(r, tmpmask1);
 426                 }
 427                 update_closid_rmid(tmpmask, rdtgrp);
 428         }
 429
 430         /* Done pushing/pulling - update this group with new mask */
 431         cpumask_copy(&rdtgrp->cpu_mask, newmask);
 432
 433         /*
 434          * Clear child mon group masks since there is a new parent mask
 435          * now and update the rmid for the cpus the child lost.
 436          */
 437         head = &rdtgrp->mon.crdtgrp_list;
 438         list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 439                 cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 440                 update_closid_rmid(tmpmask, rdtgrp);
 441                 cpumask_clear(&crgrp->cpu_mask);
 442         }
 443
 444         return 0;
 445 }
 446
 447 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 448                                    char *buf, size_t nbytes, loff_t off)
 449 {
 450         cpumask_var_t tmpmask, newmask, tmpmask1;
 451         struct rdtgroup *rdtgrp;
 452         int ret;
 453
 454         if (!buf)
 455                 return -EINVAL;
 456
 457         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 458                 return -ENOMEM;
 459         if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 460                 free_cpumask_var(tmpmask);
 461                 return -ENOMEM;
 462         }
 463         if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 464                 free_cpumask_var(tmpmask);
 465                 free_cpumask_var(newmask);
 466                 return -ENOMEM;
 467         }
 468
 469         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 470         rdt_last_cmd_clear();
 471         if (!rdtgrp) {
 472                 ret = -ENOENT;
 473                 rdt_last_cmd_puts("directory was removed\n");
 474                 goto unlock;
 475         }
 476
 477         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 478             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 479                 ret = -EINVAL;
 480                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 481                 goto unlock;
 482         }
 483
 484         if (is_cpu_list(of))
 485                 ret = cpulist_parse(buf, newmask);
 486         else
 487                 ret = cpumask_parse(buf, newmask);
 488
 489         if (ret) {
 490                 rdt_last_cmd_puts("bad cpu list/mask\n");
 491                 goto unlock;
 492         }
 493
 494         /* check that user didn't specify any offline cpus */
 495         cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 496         if (cpumask_weight(tmpmask)) {
 497                 ret = -EINVAL;
 498                 rdt_last_cmd_puts("can only assign online cpus\n");
 499                 goto unlock;
 500         }
 501
 502         if (rdtgrp->type == RDTCTRL_GROUP)
 503                 ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 504         else if (rdtgrp->type == RDTMON_GROUP)
 505                 ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 506         else
 507                 ret = -EINVAL;
 508
 509 unlock:
 510         rdtgroup_kn_unlock(of->kn);
 511         free_cpumask_var(tmpmask);
 512         free_cpumask_var(newmask);
 513         free_cpumask_var(tmpmask1);
 514
 515         return ret ?: nbytes;
 516 }
 517
 518 /**
 519  * rdtgroup_remove - the helper to remove resource group safely
 520  * @rdtgrp: resource group to remove
 521  *
 522  * On resource group creation via a mkdir, an extra kernfs_node reference is
 523  * taken to ensure that the rdtgroup structure remains accessible for the
 524  * rdtgroup_kn_unlock() calls where it is removed.
 525  *
 526  * Drop the extra reference here, then free the rdtgroup structure.
 527  *
 528  * Return: void
 529  */
 530 static void rdtgroup_remove(struct rdtgroup *rdtgrp)
 531 {
 532         kernfs_put(rdtgrp->kn);
 533         kfree(rdtgrp);
 534 }
 535
 536 static void _update_task_closid_rmid(void *task)
 537 {
 538         /*
 539          * If the task is still current on this CPU, update PQR_ASSOC MSR.
 540          * Otherwise, the MSR is updated when the task is scheduled in.
 541          */
 542         if (task == current)
 543                 intel_rdt_sched_in();
 544 }
 545
 546 static void update_task_closid_rmid(struct task_struct *t)
 547 {
 548         if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
 549                 smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
 550         else
 551                 _update_task_closid_rmid(t);
 552 }
 553
 554 static int __rdtgroup_move_task(struct task_struct *tsk,
 555                                 struct rdtgroup *rdtgrp)
 556 {
 557         /* If the task is already in rdtgrp, no need to move the task. */
 558         if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
 559              tsk->rmid == rdtgrp->mon.rmid) ||
 560             (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
 561              tsk->closid == rdtgrp->mon.parent->closid))
 562                 return 0;
 563
 564         /*
 565          * Set the task's closid/rmid before the PQR_ASSOC MSR can be
 566          * updated by them.
 567          *
 568          * For ctrl_mon groups, move both closid and rmid.
 569          * For monitor groups, can move the tasks only from
 570          * their parent CTRL group.
 571          */
 572
 573         if (rdtgrp->type == RDTCTRL_GROUP) {
 574                 tsk->closid = rdtgrp->closid;
 575                 tsk->rmid = rdtgrp->mon.rmid;
 576         } else if (rdtgrp->type == RDTMON_GROUP) {
 577                 if (rdtgrp->mon.parent->closid == tsk->closid) {
 578                         tsk->rmid = rdtgrp->mon.rmid;
 579                 } else {
 580                         rdt_last_cmd_puts("Can't move task to different control group\n");
 581                         return -EINVAL;
 582                 }
 583         }
 584
 585         /*
 586          * Ensure the task's closid and rmid are written before determining if
 587          * the task is current that will decide if it will be interrupted.
 588          * This pairs with the full barrier between the rq->curr update and
 589          * resctrl_sched_in() during context switch.
 590          */
 591         smp_mb();
 592
 593         /*
 594          * By now, the task's closid and rmid are set. If the task is current
 595          * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
 596          * group go into effect. If the task is not current, the MSR will be
 597          * updated when the task is scheduled in.
 598          */
 599         update_task_closid_rmid(tsk);
 600
 601         return 0;
 602 }
 603
 604 /**
 605  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 606  * @r: Resource group
 607  *
 608  * Return: 1 if tasks have been assigned to @r, 0 otherwise
 609  */
 610 int rdtgroup_tasks_assigned(struct rdtgroup *r)
 611 {
 612         struct task_struct *p, *t;
 613         int ret = 0;
 614
 615         lockdep_assert_held(&rdtgroup_mutex);
 616
 617         rcu_read_lock();
 618         for_each_process_thread(p, t) {
 619                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 620                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
 621                         ret = 1;
 622                         break;
 623                 }
 624         }
 625         rcu_read_unlock();
 626
 627         return ret;
 628 }
 629
 630 static int rdtgroup_task_write_permission(struct task_struct *task,
 631                                           struct kernfs_open_file *of)
 632 {
 633         const struct cred *tcred = get_task_cred(task);
 634         const struct cred *cred = current_cred();
 635         int ret = 0;
 636
 637         /*
 638          * Even if we're attaching all tasks in the thread group, we only
 639          * need to check permissions on one of them.
 640          */
 641         if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 642             !uid_eq(cred->euid, tcred->uid) &&
 643             !uid_eq(cred->euid, tcred->suid)) {
 644                 rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 645                 ret = -EPERM;
 646         }
 647
 648         put_cred(tcred);
 649         return ret;
 650 }
 651
 652 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 653                               struct kernfs_open_file *of)
 654 {
 655         struct task_struct *tsk;
 656         int ret;
 657
 658         rcu_read_lock();
 659         if (pid) {
 660                 tsk = find_task_by_vpid(pid);
 661                 if (!tsk) {
 662                         rcu_read_unlock();
 663                         rdt_last_cmd_printf("No task %d\n", pid);
 664                         return -ESRCH;
 665                 }
 666         } else {
 667                 tsk = current;
 668         }
 669
 670         get_task_struct(tsk);
 671         rcu_read_unlock();
 672
 673         ret = rdtgroup_task_write_permission(tsk, of);
 674         if (!ret)
 675                 ret = __rdtgroup_move_task(tsk, rdtgrp);
 676
 677         put_task_struct(tsk);
 678         return ret;
 679 }
 680
 681 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 682                                     char *buf, size_t nbytes, loff_t off)
 683 {
 684         struct rdtgroup *rdtgrp;
 685         int ret = 0;
 686         pid_t pid;
 687
 688         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 689                 return -EINVAL;
 690         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 691         if (!rdtgrp) {
 692                 rdtgroup_kn_unlock(of->kn);
 693                 return -ENOENT;
 694         }
 695         rdt_last_cmd_clear();
 696
 697         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 698             rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 699                 ret = -EINVAL;
 700                 rdt_last_cmd_puts("pseudo-locking in progress\n");
 701                 goto unlock;
 702         }
 703
 704         ret = rdtgroup_move_task(pid, rdtgrp, of);
 705
 706 unlock:
 707         rdtgroup_kn_unlock(of->kn);
 708
 709         return ret ?: nbytes;
 710 }
 711
 712 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 713 {
 714         struct task_struct *p, *t;
 715
 716         rcu_read_lock();
 717         for_each_process_thread(p, t) {
 718                 if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
 719                     (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
 720                         seq_printf(s, "%d\n", t->pid);
 721         }
 722         rcu_read_unlock();
 723 }
 724
 725 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 726                                struct seq_file *s, void *v)
 727 {
 728         struct rdtgroup *rdtgrp;
 729         int ret = 0;
 730
 731         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 732         if (rdtgrp)
 733                 show_rdt_tasks(rdtgrp, s);
 734         else
 735                 ret = -ENOENT;
 736         rdtgroup_kn_unlock(of->kn);
 737
 738         return ret;
 739 }
 740
 741 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 742                                     struct seq_file *seq, void *v)
 743 {
 744         int len;
 745
 746         mutex_lock(&rdtgroup_mutex);
 747         len = seq_buf_used(&last_cmd_status);
 748         if (len)
 749                 seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 750         else
 751                 seq_puts(seq, "ok\n");
 752         mutex_unlock(&rdtgroup_mutex);
 753         return 0;
 754 }
 755
 756 static int rdt_num_closids_show(struct kernfs_open_file *of,
 757                                 struct seq_file *seq, void *v)
 758 {
 759         struct rdt_resource *r = of->kn->parent->priv;
 760
 761         seq_printf(seq, "%d\n", r->num_closid);
 762         return 0;
 763 }
 764
 765 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
 766                              struct seq_file *seq, void *v)
 767 {
 768         struct rdt_resource *r = of->kn->parent->priv;
 769
 770         seq_printf(seq, "%x\n", r->default_ctrl);
 771         return 0;
 772 }
 773
 774 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 775                              struct seq_file *seq, void *v)
 776 {
 777         struct rdt_resource *r = of->kn->parent->priv;
 778
 779         seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
 780         return 0;
 781 }
 782
 783 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 784                                    struct seq_file *seq, void *v)
 785 {
 786         struct rdt_resource *r = of->kn->parent->priv;
 787
 788         seq_printf(seq, "%x\n", r->cache.shareable_bits);
 789         return 0;
 790 }
 791
 792 /**
 793  * rdt_bit_usage_show - Display current usage of resources
 794  *
 795  * A domain is a shared resource that can now be allocated differently. Here
 796  * we display the current regions of the domain as an annotated bitmask.
 797  * For each domain of this resource its allocation bitmask
 798  * is annotated as below to indicate the current usage of the corresponding bit:
 799  *   0 - currently unused
 800  *   X - currently available for sharing and used by software and hardware
 801  *   H - currently used by hardware only but available for software use
 802  *   S - currently used and shareable by software only
 803  *   E - currently used exclusively by one resource group
 804  *   P - currently pseudo-locked by one resource group
 805  */
 806 static int rdt_bit_usage_show(struct kernfs_open_file *of,
 807                               struct seq_file *seq, void *v)
 808 {
 809         struct rdt_resource *r = of->kn->parent->priv;
 810         /*
 811          * Use unsigned long even though only 32 bits are used to ensure
 812          * test_bit() is used safely.
 813          */
 814         unsigned long sw_shareable = 0, hw_shareable = 0;
 815         unsigned long exclusive = 0, pseudo_locked = 0;
 816         struct rdt_domain *dom;
 817         int i, hwb, swb, excl, psl;
 818         enum rdtgrp_mode mode;
 819         bool sep = false;
 820         u32 *ctrl;
 821
 822         mutex_lock(&rdtgroup_mutex);
 823         hw_shareable = r->cache.shareable_bits;
 824         list_for_each_entry(dom, &r->domains, list) {
 825                 if (sep)
 826                         seq_putc(seq, ';');
 827                 ctrl = dom->ctrl_val;
 828                 sw_shareable = 0;
 829                 exclusive = 0;
 830                 seq_printf(seq, "%d=", dom->id);
 831                 for (i = 0; i < closids_supported(); i++, ctrl++) {
 832                         if (!closid_allocated(i))
 833                                 continue;
 834                         mode = rdtgroup_mode_by_closid(i);
 835                         switch (mode) {
 836                         case RDT_MODE_SHAREABLE:
 837                                 sw_shareable |= *ctrl;
 838                                 break;
 839                         case RDT_MODE_EXCLUSIVE:
 840                                 exclusive |= *ctrl;
 841                                 break;
 842                         case RDT_MODE_PSEUDO_LOCKSETUP:
 843                         /*
 844                          * RDT_MODE_PSEUDO_LOCKSETUP is possible
 845                          * here but not included since the CBM
 846                          * associated with this CLOSID in this mode
 847                          * is not initialized and no task or cpu can be
 848                          * assigned this CLOSID.
 849                          */
 850                                 break;
 851                         case RDT_MODE_PSEUDO_LOCKED:
 852                         case RDT_NUM_MODES:
 853                                 WARN(1,
 854                                      "invalid mode for closid %d\n", i);
 855                                 break;
 856                         }
 857                 }
 858                 for (i = r->cache.cbm_len - 1; i >= 0; i--) {
 859                         pseudo_locked = dom->plr ? dom->plr->cbm : 0;
 860                         hwb = test_bit(i, &hw_shareable);
 861                         swb = test_bit(i, &sw_shareable);
 862                         excl = test_bit(i, &exclusive);
 863                         psl = test_bit(i, &pseudo_locked);
 864                         if (hwb && swb)
 865                                 seq_putc(seq, 'X');
 866                         else if (hwb && !swb)
 867                                 seq_putc(seq, 'H');
 868                         else if (!hwb && swb)
 869                                 seq_putc(seq, 'S');
 870                         else if (excl)
 871                                 seq_putc(seq, 'E');
 872                         else if (psl)
 873                                 seq_putc(seq, 'P');
 874                         else /* Unused bits remain */
 875                                 seq_putc(seq, '0');
 876                 }
 877                 sep = true;
 878         }
 879         seq_putc(seq, '\n');
 880         mutex_unlock(&rdtgroup_mutex);
 881         return 0;
 882 }
 883
 884 static int rdt_min_bw_show(struct kernfs_open_file *of,
 885                              struct seq_file *seq, void *v)
 886 {
 887         struct rdt_resource *r = of->kn->parent->priv;
 888
 889         seq_printf(seq, "%u\n", r->membw.min_bw);
 890         return 0;
 891 }
 892
 893 static int rdt_num_rmids_show(struct kernfs_open_file *of,
 894                               struct seq_file *seq, void *v)
 895 {
 896         struct rdt_resource *r = of->kn->parent->priv;
 897
 898         seq_printf(seq, "%d\n", r->num_rmid);
 899
 900         return 0;
 901 }
 902
 903 static int rdt_mon_features_show(struct kernfs_open_file *of,
 904                                  struct seq_file *seq, void *v)
 905 {
 906         struct rdt_resource *r = of->kn->parent->priv;
 907         struct mon_evt *mevt;
 908
 909         list_for_each_entry(mevt, &r->evt_list, list)
 910                 seq_printf(seq, "%s\n", mevt->name);
 911
 912         return 0;
 913 }
 914
 915 static int rdt_bw_gran_show(struct kernfs_open_file *of,
 916                              struct seq_file *seq, void *v)
 917 {
 918         struct rdt_resource *r = of->kn->parent->priv;
 919
 920         seq_printf(seq, "%u\n", r->membw.bw_gran);
 921         return 0;
 922 }
 923
 924 static int rdt_delay_linear_show(struct kernfs_open_file *of,
 925                              struct seq_file *seq, void *v)
 926 {
 927         struct rdt_resource *r = of->kn->parent->priv;
 928
 929         seq_printf(seq, "%u\n", r->membw.delay_linear);
 930         return 0;
 931 }
 932
 933 static int max_threshold_occ_show(struct kernfs_open_file *of,
 934                                   struct seq_file *seq, void *v)
 935 {
 936         struct rdt_resource *r = of->kn->parent->priv;
 937
 938         seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
 939
 940         return 0;
 941 }
 942
 943 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 944                                        char *buf, size_t nbytes, loff_t off)
 945 {
 946         struct rdt_resource *r = of->kn->parent->priv;
 947         unsigned int bytes;
 948         int ret;
 949
 950         ret = kstrtouint(buf, 0, &bytes);
 951         if (ret)
 952                 return ret;
 953
 954         if (bytes > (boot_cpu_data.x86_cache_size * 1024))
 955                 return -EINVAL;
 956
 957         intel_cqm_threshold = bytes / r->mon_scale;
 958
 959         return nbytes;
 960 }
 961
 962 /*
 963  * rdtgroup_mode_show - Display mode of this resource group
 964  */
 965 static int rdtgroup_mode_show(struct kernfs_open_file *of,
 966                               struct seq_file *s, void *v)
 967 {
 968         struct rdtgroup *rdtgrp;
 969
 970         rdtgrp = rdtgroup_kn_lock_live(of->kn);
 971         if (!rdtgrp) {
 972                 rdtgroup_kn_unlock(of->kn);
 973                 return -ENOENT;
 974         }
 975
 976         seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
 977
 978         rdtgroup_kn_unlock(of->kn);
 979         return 0;
 980 }
 981
 982 /**
 983  * rdt_cdp_peer_get - Retrieve CDP peer if it exists
 984  * @r: RDT resource to which RDT domain @d belongs
 985  * @d: Cache instance for which a CDP peer is requested
 986  * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
 987  *         Used to return the result.
 988  * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
 989  *         Used to return the result.
 990  *
 991  * RDT resources are managed independently and by extension the RDT domains
 992  * (RDT resource instances) are managed independently also. The Code and
 993  * Data Prioritization (CDP) RDT resources, while managed independently,
 994  * could refer to the same underlying hardware. For example,
 995  * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
 996  *
 997  * When provided with an RDT resource @r and an instance of that RDT
 998  * resource @d rdt_cdp_peer_get() will return if there is a peer RDT
 999  * resource and the exact instance that shares the same hardware.
1000  *
1001  * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
1002  *         If a CDP peer was found, @r_cdp will point to the peer RDT resource
1003  *         and @d_cdp will point to the peer RDT domain.
1004  */
1005 static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
1006                             struct rdt_resource **r_cdp,
1007                             struct rdt_domain **d_cdp)
1008 {
1009         struct rdt_resource *_r_cdp = NULL;
1010         struct rdt_domain *_d_cdp = NULL;
1011         int ret = 0;
1012
1013         switch (r->rid) {
1014         case RDT_RESOURCE_L3DATA:
1015                 _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
1016                 break;
1017         case RDT_RESOURCE_L3CODE:
1018                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L3DATA];
1019                 break;
1020         case RDT_RESOURCE_L2DATA:
1021                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2CODE];
1022                 break;
1023         case RDT_RESOURCE_L2CODE:
1024                 _r_cdp =  &rdt_resources_all[RDT_RESOURCE_L2DATA];
1025                 break;
1026         default:
1027                 ret = -ENOENT;
1028                 goto out;
1029         }
1030
1031         /*
1032          * When a new CPU comes online and CDP is enabled then the new
1033          * RDT domains (if any) associated with both CDP RDT resources
1034          * are added in the same CPU online routine while the
1035          * rdtgroup_mutex is held. It should thus not happen for one
1036          * RDT domain to exist and be associated with its RDT CDP
1037          * resource but there is no RDT domain associated with the
1038          * peer RDT CDP resource. Hence the WARN.
1039          */
1040         _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
1041         if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
1042                 _r_cdp = NULL;
1043                 _d_cdp = NULL;
1044                 ret = -EINVAL;
1045         }
1046
1047 out:
1048         *r_cdp = _r_cdp;
1049         *d_cdp = _d_cdp;
1050
1051         return ret;
1052 }
1053
1054 /**
1055  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1056  * @r: Resource to which domain instance @d belongs.
1057  * @d: The domain instance for which @closid is being tested.
1058  * @cbm: Capacity bitmask being tested.
1059  * @closid: Intended closid for @cbm.
1060  * @exclusive: Only check if overlaps with exclusive resource groups
1061  *
1062  * Checks if provided @cbm intended to be used for @closid on domain
1063  * @d overlaps with any other closids or other hardware usage associated
1064  * with this domain. If @exclusive is true then only overlaps with
1065  * resource groups in exclusive mode will be considered. If @exclusive
1066  * is false then overlaps with any resource group or hardware entities
1067  * will be considered.
1068  *
1069  * @cbm is unsigned long, even if only 32 bits are used, to make the
1070  * bitmap functions work correctly.
1071  *
1072  * Return: false if CBM does not overlap, true if it does.
1073  */
1074 static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1075                                     unsigned long cbm, int closid, bool exclusive)
1076 {
1077         enum rdtgrp_mode mode;
1078         unsigned long ctrl_b;
1079         u32 *ctrl;
1080         int i;
1081
1082         /* Check for any overlap with regions used by hardware directly */
1083         if (!exclusive) {
1084                 ctrl_b = r->cache.shareable_bits;
1085                 if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1086                         return true;
1087         }
1088
1089         /* Check for overlap with other resource groups */
1090         ctrl = d->ctrl_val;
1091         for (i = 0; i < closids_supported(); i++, ctrl++) {
1092                 ctrl_b = *ctrl;
1093                 mode = rdtgroup_mode_by_closid(i);
1094                 if (closid_allocated(i) && i != closid &&
1095                     mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1096                         if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1097                                 if (exclusive) {
1098                                         if (mode == RDT_MODE_EXCLUSIVE)
1099                                                 return true;
1100                                         continue;
1101                                 }
1102                                 return true;
1103                         }
1104                 }
1105         }
1106
1107         return false;
1108 }
1109
1110 /**
1111  * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1112  * @r: Resource to which domain instance @d belongs.
1113  * @d: The domain instance for which @closid is being tested.
1114  * @cbm: Capacity bitmask being tested.
1115  * @closid: Intended closid for @cbm.
1116  * @exclusive: Only check if overlaps with exclusive resource groups
1117  *
1118  * Resources that can be allocated using a CBM can use the CBM to control
1119  * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
1120  * for overlap. Overlap test is not limited to the specific resource for
1121  * which the CBM is intended though - when dealing with CDP resources that
1122  * share the underlying hardware the overlap check should be performed on
1123  * the CDP resource sharing the hardware also.
1124  *
1125  * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1126  * overlap test.
1127  *
1128  * Return: true if CBM overlap detected, false if there is no overlap
1129  */
1130 bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
1131                            unsigned long cbm, int closid, bool exclusive)
1132 {
1133         struct rdt_resource *r_cdp;
1134         struct rdt_domain *d_cdp;
1135
1136         if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
1137                 return true;
1138
1139         if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
1140                 return false;
1141
1142         return  __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
1143 }
1144
1145 /**
1146  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1147  *
1148  * An exclusive resource group implies that there should be no sharing of
1149  * its allocated resources. At the time this group is considered to be
1150  * exclusive this test can determine if its current schemata supports this
1151  * setting by testing for overlap with all other resource groups.
1152  *
1153  * Return: true if resource group can be exclusive, false if there is overlap
1154  * with allocations of other resource groups and thus this resource group
1155  * cannot be exclusive.
1156  */
1157 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1158 {
1159         int closid = rdtgrp->closid;
1160         struct rdt_resource *r;
1161         bool has_cache = false;
1162         struct rdt_domain *d;
1163
1164         for_each_alloc_enabled_rdt_resource(r) {
1165                 if (r->rid == RDT_RESOURCE_MBA)
1166                         continue;
1167                 has_cache = true;
1168                 list_for_each_entry(d, &r->domains, list) {
1169                         if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
1170                                                   rdtgrp->closid, false)) {
1171                                 rdt_last_cmd_puts("schemata overlaps\n");
1172                                 return false;
1173                         }
1174                 }
1175         }
1176
1177         if (!has_cache) {
1178                 rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
1179                 return false;
1180         }
1181
1182         return true;
1183 }
1184
1185 /**
1186  * rdtgroup_mode_write - Modify the resource group's mode
1187  *
1188  */
1189 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1190                                    char *buf, size_t nbytes, loff_t off)
1191 {
1192         struct rdtgroup *rdtgrp;
1193         enum rdtgrp_mode mode;
1194         int ret = 0;
1195
1196         /* Valid input requires a trailing newline */
1197         if (nbytes == 0 || buf[nbytes - 1] != '\n')
1198                 return -EINVAL;
1199         buf[nbytes - 1] = '\0';
1200
1201         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1202         if (!rdtgrp) {
1203                 rdtgroup_kn_unlock(of->kn);
1204                 return -ENOENT;
1205         }
1206
1207         rdt_last_cmd_clear();
1208
1209         mode = rdtgrp->mode;
1210
1211         if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1212             (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1213             (!strcmp(buf, "pseudo-locksetup") &&
1214              mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1215             (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1216                 goto out;
1217
1218         if (mode == RDT_MODE_PSEUDO_LOCKED) {
1219                 rdt_last_cmd_printf("cannot change pseudo-locked group\n");
1220                 ret = -EINVAL;
1221                 goto out;
1222         }
1223
1224         if (!strcmp(buf, "shareable")) {
1225                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1226                         ret = rdtgroup_locksetup_exit(rdtgrp);
1227                         if (ret)
1228                                 goto out;
1229                 }
1230                 rdtgrp->mode = RDT_MODE_SHAREABLE;
1231         } else if (!strcmp(buf, "exclusive")) {
1232                 if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1233                         ret = -EINVAL;
1234                         goto out;
1235                 }
1236                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1237                         ret = rdtgroup_locksetup_exit(rdtgrp);
1238                         if (ret)
1239                                 goto out;
1240                 }
1241                 rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1242         } else if (!strcmp(buf, "pseudo-locksetup")) {
1243                 ret = rdtgroup_locksetup_enter(rdtgrp);
1244                 if (ret)
1245                         goto out;
1246                 rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1247         } else {
1248                 rdt_last_cmd_printf("unknown/unsupported mode\n");
1249                 ret = -EINVAL;
1250         }
1251
1252 out:
1253         rdtgroup_kn_unlock(of->kn);
1254         return ret ?: nbytes;
1255 }
1256
1257 /**
1258  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1259  * @r: RDT resource to which @d belongs.
1260  * @d: RDT domain instance.
1261  * @cbm: bitmask for which the size should be computed.
1262  *
1263  * The bitmask provided associated with the RDT domain instance @d will be
1264  * translated into how many bytes it represents. The size in bytes is
1265  * computed by first dividing the total cache size by the CBM length to
1266  * determine how many bytes each bit in the bitmask represents. The result
1267  * is multiplied with the number of bits set in the bitmask.
1268  *
1269  * @cbm is unsigned long, even if only 32 bits are used to make the
1270  * bitmap functions work correctly.
1271  */
1272 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1273                                   struct rdt_domain *d, unsigned long cbm)
1274 {
1275         struct cpu_cacheinfo *ci;
1276         unsigned int size = 0;
1277         int num_b, i;
1278
1279         num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1280         ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
1281         for (i = 0; i < ci->num_leaves; i++) {
1282                 if (ci->info_list[i].level == r->cache_level) {
1283                         size = ci->info_list[i].size / r->cache.cbm_len * num_b;
1284                         break;
1285                 }
1286         }
1287
1288         return size;
1289 }
1290
1291 /**
1292  * rdtgroup_size_show - Display size in bytes of allocated regions
1293  *
1294  * The "size" file mirrors the layout of the "schemata" file, printing the
1295  * size in bytes of each region instead of the capacity bitmask.
1296  *
1297  */
1298 static int rdtgroup_size_show(struct kernfs_open_file *of,
1299                               struct seq_file *s, void *v)
1300 {
1301         struct rdtgroup *rdtgrp;
1302         struct rdt_resource *r;
1303         struct rdt_domain *d;
1304         unsigned int size;
1305         int ret = 0;
1306         bool sep;
1307         u32 ctrl;
1308
1309         rdtgrp = rdtgroup_kn_lock_live(of->kn);
1310         if (!rdtgrp) {
1311                 rdtgroup_kn_unlock(of->kn);
1312                 return -ENOENT;
1313         }
1314
1315         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1316                 if (!rdtgrp->plr->d) {
1317                         rdt_last_cmd_clear();
1318                         rdt_last_cmd_puts("Cache domain offline\n");
1319                         ret = -ENODEV;
1320                 } else {
1321                         seq_printf(s, "%*s:", max_name_width,
1322                                    rdtgrp->plr->r->name);
1323                         size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
1324                                                     rdtgrp->plr->d,
1325                                                     rdtgrp->plr->cbm);
1326                         seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
1327                 }
1328                 goto out;
1329         }
1330
1331         for_each_alloc_enabled_rdt_resource(r) {
1332                 sep = false;
1333                 seq_printf(s, "%*s:", max_name_width, r->name);
1334                 list_for_each_entry(d, &r->domains, list) {
1335                         if (sep)
1336                                 seq_putc(s, ';');
1337                         if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1338                                 size = 0;
1339                         } else {
1340                                 ctrl = (!is_mba_sc(r) ?
1341                                                 d->ctrl_val[rdtgrp->closid] :
1342                                                 d->mbps_val[rdtgrp->closid]);
1343                                 if (r->rid == RDT_RESOURCE_MBA)
1344                                         size = ctrl;
1345                                 else
1346                                         size = rdtgroup_cbm_to_size(r, d, ctrl);
1347                         }
1348                         seq_printf(s, "%d=%u", d->id, size);
1349                         sep = true;
1350                 }
1351                 seq_putc(s, '\n');
1352         }
1353
1354 out:
1355         rdtgroup_kn_unlock(of->kn);
1356
1357         return ret;
1358 }
1359
/*
 * Table of all resctrl files that rdtgroup_add_files() can create.
 *
 * Each entry gives the file name, its permission bits, the kernfs ops and
 * the show/write handlers. @fflags selects where the file appears:
 * rdtgroup_add_files() adds an entry to a directory only when every bit in
 * the entry's @fflags is also set in the flags requested for that directory.
 * rdtgroup_kn_mode_restore() also consults this table to look up a file's
 * original mode by name.
 */
static struct rftype res_common_files[] = {
	/* File added with RF_TOP_INFO (the "info" directory itself). */
	{
		.name		= "last_cmd_status",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_last_cmd_status_show,
		.fflags		= RF_TOP_INFO,
	},
	/* Files added to per-resource info sub-directories. */
	{
		.name		= "num_closids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_closids_show,
		.fflags		= RF_CTRL_INFO,
	},
	{
		.name		= "mon_features",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_mon_features_show,
		.fflags		= RF_MON_INFO,
	},
	{
		.name		= "num_rmids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_rmids_show,
		.fflags		= RF_MON_INFO,
	},
	{
		.name		= "cbm_mask",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_default_ctrl_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "min_cbm_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_cbm_bits_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "shareable_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_shareable_bits_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "bit_usage",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bit_usage_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "min_bandwidth",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_bw_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "bandwidth_gran",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bw_gran_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "delay_linear",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_delay_linear_show,
		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "max_threshold_occupancy",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= max_threshold_occ_write,
		.seq_show	= max_threshold_occ_show,
		.fflags		= RF_MON_INFO | RFTYPE_RES_CACHE,
	},
	/*
	 * Files selected by RFTYPE_BASE / RF_CTRL_BASE.
	 * NOTE(review): presumably these are the per-group files; the callers
	 * passing these flags are outside this chunk - confirm.
	 */
	{
		.name		= "cpus",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.fflags		= RFTYPE_BASE,
	},
	{
		/* Same handlers as "cpus"; differs only in RFTYPE_FLAGS_CPUS_LIST. */
		.name		= "cpus_list",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.flags		= RFTYPE_FLAGS_CPUS_LIST,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "tasks",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_tasks_write,
		.seq_show	= rdtgroup_tasks_show,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "schemata",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_schemata_write,
		.seq_show	= rdtgroup_schemata_show,
		.fflags		= RF_CTRL_BASE,
	},
	{
		.name		= "mode",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_mode_write,
		.seq_show	= rdtgroup_mode_show,
		.fflags		= RF_CTRL_BASE,
	},
	{
		.name		= "size",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdtgroup_size_show,
		.fflags		= RF_CTRL_BASE,
	},

};
1497
1498 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1499 {
1500         struct rftype *rfts, *rft;
1501         int ret, len;
1502
1503         rfts = res_common_files;
1504         len = ARRAY_SIZE(res_common_files);
1505
1506         lockdep_assert_held(&rdtgroup_mutex);
1507
1508         for (rft = rfts; rft < rfts + len; rft++) {
1509                 if ((fflags & rft->fflags) == rft->fflags) {
1510                         ret = rdtgroup_add_file(kn, rft);
1511                         if (ret)
1512                                 goto error;
1513                 }
1514         }
1515
1516         return 0;
1517 error:
1518         pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
1519         while (--rft >= rfts) {
1520                 if ((fflags & rft->fflags) == rft->fflags)
1521                         kernfs_remove_by_name(kn, rft->name);
1522         }
1523         return ret;
1524 }
1525
1526 /**
1527  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
1528  * @r: The resource group with which the file is associated.
1529  * @name: Name of the file
1530  *
1531  * The permissions of named resctrl file, directory, or link are modified
1532  * to not allow read, write, or execute by any user.
1533  *
1534  * WARNING: This function is intended to communicate to the user that the
1535  * resctrl file has been locked down - that it is not relevant to the
1536  * particular state the system finds itself in. It should not be relied
1537  * on to protect from user access because after the file's permissions
1538  * are restricted the user can still change the permissions using chmod
1539  * from the command line.
1540  *
1541  * Return: 0 on success, <0 on failure.
1542  */
1543 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
1544 {
1545         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1546         struct kernfs_node *kn;
1547         int ret = 0;
1548
1549         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1550         if (!kn)
1551                 return -ENOENT;
1552
1553         switch (kernfs_type(kn)) {
1554         case KERNFS_DIR:
1555                 iattr.ia_mode = S_IFDIR;
1556                 break;
1557         case KERNFS_FILE:
1558                 iattr.ia_mode = S_IFREG;
1559                 break;
1560         case KERNFS_LINK:
1561                 iattr.ia_mode = S_IFLNK;
1562                 break;
1563         }
1564
1565         ret = kernfs_setattr(kn, &iattr);
1566         kernfs_put(kn);
1567         return ret;
1568 }
1569
1570 /**
1571  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
1572  * @r: The resource group with which the file is associated.
1573  * @name: Name of the file
1574  * @mask: Mask of permissions that should be restored
1575  *
1576  * Restore the permissions of the named file. If @name is a directory the
1577  * permissions of its parent will be used.
1578  *
1579  * Return: 0 on success, <0 on failure.
1580  */
1581 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
1582                              umode_t mask)
1583 {
1584         struct iattr iattr = {.ia_valid = ATTR_MODE,};
1585         struct kernfs_node *kn, *parent;
1586         struct rftype *rfts, *rft;
1587         int ret, len;
1588
1589         rfts = res_common_files;
1590         len = ARRAY_SIZE(res_common_files);
1591
1592         for (rft = rfts; rft < rfts + len; rft++) {
1593                 if (!strcmp(rft->name, name))
1594                         iattr.ia_mode = rft->mode & mask;
1595         }
1596
1597         kn = kernfs_find_and_get_ns(r->kn, name, NULL);
1598         if (!kn)
1599                 return -ENOENT;
1600
1601         switch (kernfs_type(kn)) {
1602         case KERNFS_DIR:
1603                 parent = kernfs_get_parent(kn);
1604                 if (parent) {
1605                         iattr.ia_mode |= parent->mode;
1606                         kernfs_put(parent);
1607                 }
1608                 iattr.ia_mode |= S_IFDIR;
1609                 break;
1610         case KERNFS_FILE:
1611                 iattr.ia_mode |= S_IFREG;
1612                 break;
1613         case KERNFS_LINK:
1614                 iattr.ia_mode |= S_IFLNK;
1615                 break;
1616         }
1617
1618         ret = kernfs_setattr(kn, &iattr);
1619         kernfs_put(kn);
1620         return ret;
1621 }
1622
1623 static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
1624                                       unsigned long fflags)
1625 {
1626         struct kernfs_node *kn_subdir;
1627         int ret;
1628
1629         kn_subdir = kernfs_create_dir(kn_info, name,
1630                                       kn_info->mode, r);
1631         if (IS_ERR(kn_subdir))
1632                 return PTR_ERR(kn_subdir);
1633
1634         ret = rdtgroup_kn_set_ugid(kn_subdir);
1635         if (ret)
1636                 return ret;
1637
1638         ret = rdtgroup_add_files(kn_subdir, fflags);
1639         if (!ret)
1640                 kernfs_activate(kn_subdir);
1641
1642         return ret;
1643 }
1644
1645 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
1646 {
1647         struct rdt_resource *r;
1648         unsigned long fflags;
1649         char name[32];
1650         int ret;
1651
1652         /* create the directory */
1653         kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
1654         if (IS_ERR(kn_info))
1655                 return PTR_ERR(kn_info);
1656
1657         ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
1658         if (ret)
1659                 goto out_destroy;
1660
1661         for_each_alloc_enabled_rdt_resource(r) {
1662                 fflags =  r->fflags | RF_CTRL_INFO;
1663                 ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
1664                 if (ret)
1665                         goto out_destroy;
1666         }
1667
1668         for_each_mon_enabled_rdt_resource(r) {
1669                 fflags =  r->fflags | RF_MON_INFO;
1670                 sprintf(name, "%s_MON", r->name);
1671                 ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
1672                 if (ret)
1673                         goto out_destroy;
1674         }
1675
1676         ret = rdtgroup_kn_set_ugid(kn_info);
1677         if (ret)
1678                 goto out_destroy;
1679
1680         kernfs_activate(kn_info);
1681
1682         return 0;
1683
1684 out_destroy:
1685         kernfs_remove(kn_info);
1686         return ret;
1687 }
1688
1689 static int
1690 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
1691                     char *name, struct kernfs_node **dest_kn)
1692 {
1693         struct kernfs_node *kn;
1694         int ret;
1695
1696         /* create the directory */
1697         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
1698         if (IS_ERR(kn))
1699                 return PTR_ERR(kn);
1700
1701         if (dest_kn)
1702                 *dest_kn = kn;
1703
1704         ret = rdtgroup_kn_set_ugid(kn);
1705         if (ret)
1706                 goto out_destroy;
1707
1708         kernfs_activate(kn);
1709
1710         return 0;
1711
1712 out_destroy:
1713         kernfs_remove(kn);
1714         return ret;
1715 }
1716
1717 static void l3_qos_cfg_update(void *arg)
1718 {
1719         bool *enable = arg;
1720
1721         wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
1722 }
1723
1724 static void l2_qos_cfg_update(void *arg)
1725 {
1726         bool *enable = arg;
1727
1728         wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
1729 }
1730
1731 static inline bool is_mba_linear(void)
1732 {
1733         return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
1734 }
1735
1736 static int set_cache_qos_cfg(int level, bool enable)
1737 {
1738         void (*update)(void *arg);
1739         struct rdt_resource *r_l;
1740         cpumask_var_t cpu_mask;
1741         struct rdt_domain *d;
1742         int cpu;
1743
1744         if (level == RDT_RESOURCE_L3)
1745                 update = l3_qos_cfg_update;
1746         else if (level == RDT_RESOURCE_L2)
1747                 update = l2_qos_cfg_update;
1748         else
1749                 return -EINVAL;
1750
1751         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
1752                 return -ENOMEM;
1753
1754         r_l = &rdt_resources_all[level];
1755         list_for_each_entry(d, &r_l->domains, list) {
1756                 /* Pick one CPU from each domain instance to update MSR */
1757                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
1758         }
1759         cpu = get_cpu();
1760         /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
1761         if (cpumask_test_cpu(cpu, cpu_mask))
1762                 update(&enable);
1763         /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
1764         smp_call_function_many(cpu_mask, update, &enable, 1);
1765         put_cpu();
1766
1767         free_cpumask_var(cpu_mask);
1768
1769         return 0;
1770 }
1771
1772 /* Restore the qos cfg state when a domain comes online */
1773 void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
1774 {
1775         if (!r->alloc_capable)
1776                 return;
1777
1778         if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
1779                 l2_qos_cfg_update(&r->alloc_enabled);
1780
1781         if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
1782                 l3_qos_cfg_update(&r->alloc_enabled);
1783 }
1784
1785 /*
1786  * Enable or disable the MBA software controller
1787  * which helps user specify bandwidth in MBps.
1788  * MBA software controller is supported only if
1789  * MBM is supported and MBA is in linear scale.
1790  */
1791 static int set_mba_sc(bool mba_sc)
1792 {
1793         struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
1794         struct rdt_domain *d;
1795
1796         if (!is_mbm_enabled() || !is_mba_linear() ||
1797             mba_sc == is_mba_sc(r))
1798                 return -EINVAL;
1799
1800         r->membw.mba_sc = mba_sc;
1801         list_for_each_entry(d, &r->domains, list)
1802                 setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
1803
1804         return 0;
1805 }
1806
1807 static int cdp_enable(int level, int data_type, int code_type)
1808 {
1809         struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
1810         struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
1811         struct rdt_resource *r_l = &rdt_resources_all[level];
1812         int ret;
1813
1814         if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
1815             !r_lcode->alloc_capable)
1816                 return -EINVAL;
1817
1818         ret = set_cache_qos_cfg(level, true);
1819         if (!ret) {
1820                 r_l->alloc_enabled = false;
1821                 r_ldata->alloc_enabled = true;
1822                 r_lcode->alloc_enabled = true;
1823         }
1824         return ret;
1825 }
1826
1827 static int cdpl3_enable(void)
1828 {
1829         return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
1830                           RDT_RESOURCE_L3CODE);
1831 }
1832
1833 static int cdpl2_enable(void)
1834 {
1835         return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
1836                           RDT_RESOURCE_L2CODE);
1837 }
1838
1839 static void cdp_disable(int level, int data_type, int code_type)
1840 {
1841         struct rdt_resource *r = &rdt_resources_all[level];
1842
1843         r->alloc_enabled = r->alloc_capable;
1844
1845         if (rdt_resources_all[data_type].alloc_enabled) {
1846                 rdt_resources_all[data_type].alloc_enabled = false;
1847                 rdt_resources_all[code_type].alloc_enabled = false;
1848                 set_cache_qos_cfg(level, false);
1849         }
1850 }
1851
1852 static void cdpl3_disable(void)
1853 {
1854         cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
1855 }
1856
1857 static void cdpl2_disable(void)
1858 {
1859         cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
1860 }
1861
1862 static void cdp_disable_all(void)
1863 {
1864         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
1865                 cdpl3_disable();
1866         if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
1867                 cdpl2_disable();
1868 }
1869
1870 static int parse_rdtgroupfs_options(char *data)
1871 {
1872         char *token, *o = data;
1873         int ret = 0;
1874
1875         while ((token = strsep(&o, ",")) != NULL) {
1876                 if (!*token) {
1877                         ret = -EINVAL;
1878                         goto out;
1879                 }
1880
1881                 if (!strcmp(token, "cdp")) {
1882                         ret = cdpl3_enable();
1883                         if (ret)
1884                                 goto out;
1885                 } else if (!strcmp(token, "cdpl2")) {
1886                         ret = cdpl2_enable();
1887                         if (ret)
1888                                 goto out;
1889                 } else if (!strcmp(token, "mba_MBps")) {
1890                         ret = set_mba_sc(true);
1891                         if (ret)
1892                                 goto out;
1893                 } else {
1894                         ret = -EINVAL;
1895                         goto out;
1896                 }
1897         }
1898
1899         return 0;
1900
1901 out:
1902         pr_err("Invalid mount option \"%s\"\n", token);
1903
1904         return ret;
1905 }
1906
1907 /*
1908  * We don't allow rdtgroup directories to be created anywhere
1909  * except the root directory. Thus when looking for the rdtgroup
1910  * structure for a kernfs node we are either looking at a directory,
1911  * in which case the rdtgroup structure is pointed at by the "priv"
1912  * field, otherwise we have a file, and need only look to the parent
1913  * to find the rdtgroup.
1914  */
1915 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
1916 {
1917         if (kernfs_type(kn) == KERNFS_DIR) {
1918                 /*
1919                  * All the resource directories use "kn->priv"
1920                  * to point to the "struct rdtgroup" for the
1921                  * resource. "info" and its subdirectories don't
1922                  * have rdtgroup structures, so return NULL here.
1923                  */
1924                 if (kn == kn_info || kn->parent == kn_info)
1925                         return NULL;
1926                 else
1927                         return kn->priv;
1928         } else {
1929                 return kn->parent->priv;
1930         }
1931 }
1932
1933 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
1934 {
1935         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1936
1937         if (!rdtgrp)
1938                 return NULL;
1939
1940         atomic_inc(&rdtgrp->waitcount);
1941         kernfs_break_active_protection(kn);
1942
1943         mutex_lock(&rdtgroup_mutex);
1944
1945         /* Was this group deleted while we waited? */
1946         if (rdtgrp->flags & RDT_DELETED)
1947                 return NULL;
1948
1949         return rdtgrp;
1950 }
1951
1952 void rdtgroup_kn_unlock(struct kernfs_node *kn)
1953 {
1954         struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
1955
1956         if (!rdtgrp)
1957                 return;
1958
1959         mutex_unlock(&rdtgroup_mutex);
1960
1961         if (atomic_dec_and_test(&rdtgrp->waitcount) &&
1962             (rdtgrp->flags & RDT_DELETED)) {
1963                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
1964                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
1965                         rdtgroup_pseudo_lock_remove(rdtgrp);
1966                 kernfs_unbreak_active_protection(kn);
1967                 rdtgroup_remove(rdtgrp);
1968         } else {
1969                 kernfs_unbreak_active_protection(kn);
1970         }
1971 }
1972
1973 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
1974                              struct rdtgroup *prgrp,
1975                              struct kernfs_node **mon_data_kn);
1976
1977 static struct dentry *rdt_mount(struct file_system_type *fs_type,
1978                                 int flags, const char *unused_dev_name,
1979                                 void *data)
1980 {
1981         struct rdt_domain *dom;
1982         struct rdt_resource *r;
1983         struct dentry *dentry;
1984         int ret;
1985
1986         cpus_read_lock();
1987         mutex_lock(&rdtgroup_mutex);
1988         /*
1989          * resctrl file system can only be mounted once.
1990          */
1991         if (static_branch_unlikely(&rdt_enable_key)) {
1992                 dentry = ERR_PTR(-EBUSY);
1993                 goto out;
1994         }
1995
1996         ret = parse_rdtgroupfs_options(data);
1997         if (ret) {
1998                 dentry = ERR_PTR(ret);
1999                 goto out_cdp;
2000         }
2001
2002         closid_init();
2003
2004         ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2005         if (ret) {
2006                 dentry = ERR_PTR(ret);
2007                 goto out_cdp;
2008         }
2009
2010         if (rdt_mon_capable) {
2011                 ret = mongroup_create_dir(rdtgroup_default.kn,
2012                                           &rdtgroup_default, "mon_groups",
2013                                           &kn_mongrp);
2014                 if (ret) {
2015                         dentry = ERR_PTR(ret);
2016                         goto out_info;
2017                 }
2018
2019                 ret = mkdir_mondata_all(rdtgroup_default.kn,
2020                                         &rdtgroup_default, &kn_mondata);
2021                 if (ret) {
2022                         dentry = ERR_PTR(ret);
2023                         goto out_mongrp;
2024                 }
2025                 rdtgroup_default.mon.mon_data_kn = kn_mondata;
2026         }
2027
2028         ret = rdt_pseudo_lock_init();
2029         if (ret) {
2030                 dentry = ERR_PTR(ret);
2031                 goto out_mondata;
2032         }
2033
2034         dentry = kernfs_mount(fs_type, flags, rdt_root,
2035                               RDTGROUP_SUPER_MAGIC, NULL);
2036         if (IS_ERR(dentry))
2037                 goto out_psl;
2038
2039         if (rdt_alloc_capable)
2040                 static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
2041         if (rdt_mon_capable)
2042                 static_branch_enable_cpuslocked(&rdt_mon_enable_key);
2043
2044         if (rdt_alloc_capable || rdt_mon_capable)
2045                 static_branch_enable_cpuslocked(&rdt_enable_key);
2046
2047         if (is_mbm_enabled()) {
2048                 r = &rdt_resources_all[RDT_RESOURCE_L3];
2049                 list_for_each_entry(dom, &r->domains, list)
2050                         mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
2051         }
2052
2053         goto out;
2054
2055 out_psl:
2056         rdt_pseudo_lock_release();
2057 out_mondata:
2058         if (rdt_mon_capable)
2059                 kernfs_remove(kn_mondata);
2060 out_mongrp:
2061         if (rdt_mon_capable)
2062                 kernfs_remove(kn_mongrp);
2063 out_info:
2064         kernfs_remove(kn_info);
2065 out_cdp:
2066         cdp_disable_all();
2067 out:
2068         rdt_last_cmd_clear();
2069         mutex_unlock(&rdtgroup_mutex);
2070         cpus_read_unlock();
2071
2072         return dentry;
2073 }
2074
2075 static int reset_all_ctrls(struct rdt_resource *r)
2076 {
2077         struct msr_param msr_param;
2078         cpumask_var_t cpu_mask;
2079         struct rdt_domain *d;
2080         int i, cpu;
2081
2082         if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
2083                 return -ENOMEM;
2084
2085         msr_param.res = r;
2086         msr_param.low = 0;
2087         msr_param.high = r->num_closid;
2088
2089         /*
2090          * Disable resource control for this resource by setting all
2091          * CBMs in all domains to the maximum mask value. Pick one CPU
2092          * from each domain to update the MSRs below.
2093          */
2094         list_for_each_entry(d, &r->domains, list) {
2095                 cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
2096
2097                 for (i = 0; i < r->num_closid; i++)
2098                         d->ctrl_val[i] = r->default_ctrl;
2099         }
2100         cpu = get_cpu();
2101         /* Update CBM on this cpu if it's in cpu_mask. */
2102         if (cpumask_test_cpu(cpu, cpu_mask))
2103                 rdt_ctrl_update(&msr_param);
2104         /* Update CBM on all other cpus in cpu_mask. */
2105         smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
2106         put_cpu();
2107
2108         free_cpumask_var(cpu_mask);
2109
2110         return 0;
2111 }
2112
2113 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
2114 {
2115         return (rdt_alloc_capable &&
2116                 (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
2117 }
2118
2119 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
2120 {
2121         return (rdt_mon_capable &&
2122                 (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
2123 }
2124
2125 /*
2126  * Move tasks from one to the other group. If @from is NULL, then all tasks
2127  * in the systems are moved unconditionally (used for teardown).
2128  *
2129  * If @mask is not NULL the cpus on which moved tasks are running are set
2130  * in that mask so the update smp function call is restricted to affected
2131  * cpus.
2132  */
2133 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2134                                  struct cpumask *mask)
2135 {
2136         struct task_struct *p, *t;
2137
2138         read_lock(&tasklist_lock);
2139         for_each_process_thread(p, t) {
2140                 if (!from || is_closid_match(t, from) ||
2141                     is_rmid_match(t, from)) {
2142                         t->closid = to->closid;
2143                         t->rmid = to->mon.rmid;
2144
2145                         /*
2146                          * Order the closid/rmid stores above before the loads
2147                          * in task_curr(). This pairs with the full barrier
2148                          * between the rq->curr update and resctrl_sched_in()
2149                          * during context switch.
2150                          */
2151                         smp_mb();
2152
2153                         /*
2154                          * If the task is on a CPU, set the CPU in the mask.
2155                          * The detection is inaccurate as tasks might move or
2156                          * schedule before the smp function call takes place.
2157                          * In such a case the function call is pointless, but
2158                          * there is no other side effect.
2159                          */
2160                         if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
2161                                 cpumask_set_cpu(task_cpu(t), mask);
2162                 }
2163         }
2164         read_unlock(&tasklist_lock);
2165 }
2166
2167 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2168 {
2169         struct rdtgroup *sentry, *stmp;
2170         struct list_head *head;
2171
2172         head = &rdtgrp->mon.crdtgrp_list;
2173         list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2174                 free_rmid(sentry->mon.rmid);
2175                 list_del(&sentry->mon.crdtgrp_list);
2176
2177                 if (atomic_read(&sentry->waitcount) != 0)
2178                         sentry->flags = RDT_DELETED;
2179                 else
2180                         rdtgroup_remove(sentry);
2181         }
2182 }
2183
2184 /*
2185  * Forcibly remove all of subdirectories under root.
2186  */
2187 static void rmdir_all_sub(void)
2188 {
2189         struct rdtgroup *rdtgrp, *tmp;
2190
2191         /* Move all tasks to the default resource group */
2192         rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2193
2194         list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2195                 /* Free any child rmids */
2196                 free_all_child_rdtgrp(rdtgrp);
2197
2198                 /* Remove each rdtgroup other than root */
2199                 if (rdtgrp == &rdtgroup_default)
2200                         continue;
2201
2202                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2203                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2204                         rdtgroup_pseudo_lock_remove(rdtgrp);
2205
2206                 /*
2207                  * Give any CPUs back to the default group. We cannot copy
2208                  * cpu_online_mask because a CPU might have executed the
2209                  * offline callback already, but is still marked online.
2210                  */
2211                 cpumask_or(&rdtgroup_default.cpu_mask,
2212                            &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2213
2214                 free_rmid(rdtgrp->mon.rmid);
2215
2216                 kernfs_remove(rdtgrp->kn);
2217                 list_del(&rdtgrp->rdtgroup_list);
2218
2219                 if (atomic_read(&rdtgrp->waitcount) != 0)
2220                         rdtgrp->flags = RDT_DELETED;
2221                 else
2222                         rdtgroup_remove(rdtgrp);
2223         }
2224         /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2225         update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2226
2227         kernfs_remove(kn_info);
2228         kernfs_remove(kn_mongrp);
2229         kernfs_remove(kn_mondata);
2230 }
2231
2232 static void rdt_kill_sb(struct super_block *sb)
2233 {
2234         struct rdt_resource *r;
2235
2236         cpus_read_lock();
2237         mutex_lock(&rdtgroup_mutex);
2238
2239         set_mba_sc(false);
2240
2241         /*Put everything back to default values. */
2242         for_each_alloc_enabled_rdt_resource(r)
2243                 reset_all_ctrls(r);
2244         cdp_disable_all();
2245         rmdir_all_sub();
2246         rdt_pseudo_lock_release();
2247         rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2248         static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
2249         static_branch_disable_cpuslocked(&rdt_mon_enable_key);
2250         static_branch_disable_cpuslocked(&rdt_enable_key);
2251         kernfs_kill_sb(sb);
2252         mutex_unlock(&rdtgroup_mutex);
2253         cpus_read_unlock();
2254 }
2255
/*
 * Filesystem type descriptor for "resctrl": mounting goes through
 * rdt_mount(), superblock teardown through rdt_kill_sb().
 */
static struct file_system_type rdt_fs_type = {
        .name    = "resctrl",
        .mount   = rdt_mount,
        .kill_sb = rdt_kill_sb,
};
2261
2262 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2263                        void *priv)
2264 {
2265         struct kernfs_node *kn;
2266         int ret = 0;
2267
2268         kn = __kernfs_create_file(parent_kn, name, 0444,
2269                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2270                                   &kf_mondata_ops, priv, NULL, NULL);
2271         if (IS_ERR(kn))
2272                 return PTR_ERR(kn);
2273
2274         ret = rdtgroup_kn_set_ugid(kn);
2275         if (ret) {
2276                 kernfs_remove(kn);
2277                 return ret;
2278         }
2279
2280         return ret;
2281 }
2282
2283 /*
2284  * Remove all subdirectories of mon_data of ctrl_mon groups
2285  * and monitor groups with given domain id.
2286  */
2287 void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
2288 {
2289         struct rdtgroup *prgrp, *crgrp;
2290         char name[32];
2291
2292         if (!r->mon_enabled)
2293                 return;
2294
2295         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2296                 sprintf(name, "mon_%s_%02d", r->name, dom_id);
2297                 kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
2298
2299                 list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
2300                         kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
2301         }
2302 }
2303
2304 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
2305                                 struct rdt_domain *d,
2306                                 struct rdt_resource *r, struct rdtgroup *prgrp)
2307 {
2308         union mon_data_bits priv;
2309         struct kernfs_node *kn;
2310         struct mon_evt *mevt;
2311         struct rmid_read rr;
2312         char name[32];
2313         int ret;
2314
2315         sprintf(name, "mon_%s_%02d", r->name, d->id);
2316         /* create the directory */
2317         kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2318         if (IS_ERR(kn))
2319                 return PTR_ERR(kn);
2320
2321         ret = rdtgroup_kn_set_ugid(kn);
2322         if (ret)
2323                 goto out_destroy;
2324
2325         if (WARN_ON(list_empty(&r->evt_list))) {
2326                 ret = -EPERM;
2327                 goto out_destroy;
2328         }
2329
2330         priv.u.rid = r->rid;
2331         priv.u.domid = d->id;
2332         list_for_each_entry(mevt, &r->evt_list, list) {
2333                 priv.u.evtid = mevt->evtid;
2334                 ret = mon_addfile(kn, mevt->name, priv.priv);
2335                 if (ret)
2336                         goto out_destroy;
2337
2338                 if (is_mbm_event(mevt->evtid))
2339                         mon_event_read(&rr, d, prgrp, mevt->evtid, true);
2340         }
2341         kernfs_activate(kn);
2342         return 0;
2343
2344 out_destroy:
2345         kernfs_remove(kn);
2346         return ret;
2347 }
2348
2349 /*
2350  * Add all subdirectories of mon_data for "ctrl_mon" groups
2351  * and "monitor" groups with given domain id.
2352  */
2353 void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
2354                                     struct rdt_domain *d)
2355 {
2356         struct kernfs_node *parent_kn;
2357         struct rdtgroup *prgrp, *crgrp;
2358         struct list_head *head;
2359
2360         if (!r->mon_enabled)
2361                 return;
2362
2363         list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
2364                 parent_kn = prgrp->mon.mon_data_kn;
2365                 mkdir_mondata_subdir(parent_kn, d, r, prgrp);
2366
2367                 head = &prgrp->mon.crdtgrp_list;
2368                 list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
2369                         parent_kn = crgrp->mon.mon_data_kn;
2370                         mkdir_mondata_subdir(parent_kn, d, r, crgrp);
2371                 }
2372         }
2373 }
2374
2375 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
2376                                        struct rdt_resource *r,
2377                                        struct rdtgroup *prgrp)
2378 {
2379         struct rdt_domain *dom;
2380         int ret;
2381
2382         list_for_each_entry(dom, &r->domains, list) {
2383                 ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
2384                 if (ret)
2385                         return ret;
2386         }
2387
2388         return 0;
2389 }
2390
2391 /*
2392  * This creates a directory mon_data which contains the monitored data.
2393  *
2394  * mon_data has one directory for each domain whic are named
2395  * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
2396  * with L3 domain looks as below:
2397  * ./mon_data:
2398  * mon_L3_00
2399  * mon_L3_01
2400  * mon_L3_02
2401  * ...
2402  *
2403  * Each domain directory has one file per event:
2404  * ./mon_L3_00/:
2405  * llc_occupancy
2406  *
2407  */
2408 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2409                              struct rdtgroup *prgrp,
2410                              struct kernfs_node **dest_kn)
2411 {
2412         struct rdt_resource *r;
2413         struct kernfs_node *kn;
2414         int ret;
2415
2416         /*
2417          * Create the mon_data directory first.
2418          */
2419         ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
2420         if (ret)
2421                 return ret;
2422
2423         if (dest_kn)
2424                 *dest_kn = kn;
2425
2426         /*
2427          * Create the subdirectories for each domain. Note that all events
2428          * in a domain like L3 are grouped into a resource whose domain is L3
2429          */
2430         for_each_mon_enabled_rdt_resource(r) {
2431                 ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
2432                 if (ret)
2433                         goto out_destroy;
2434         }
2435
2436         return 0;
2437
2438 out_destroy:
2439         kernfs_remove(kn);
2440         return ret;
2441 }
2442
2443 /**
2444  * cbm_ensure_valid - Enforce validity on provided CBM
2445  * @_val:       Candidate CBM
2446  * @r:          RDT resource to which the CBM belongs
2447  *
2448  * The provided CBM represents all cache portions available for use. This
2449  * may be represented by a bitmap that does not consist of contiguous ones
2450  * and thus be an invalid CBM.
2451  * Here the provided CBM is forced to be a valid CBM by only considering
2452  * the first set of contiguous bits as valid and clearing all bits.
2453  * The intention here is to provide a valid default CBM with which a new
2454  * resource group is initialized. The user can follow this with a
2455  * modification to the CBM if the default does not satisfy the
2456  * requirements.
2457  */
2458 static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
2459 {
2460         unsigned long val = *_val;
2461         unsigned int cbm_len = r->cache.cbm_len;
2462         unsigned long first_bit, zero_bit;
2463
2464         if (val == 0)
2465                 return;
2466
2467         first_bit = find_first_bit(&val, cbm_len);
2468         zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
2469
2470         /* Clear any remaining bits to ensure contiguous region */
2471         bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
2472         *_val = (u32)val;
2473 }
2474
2475 /**
2476  * rdtgroup_init_alloc - Initialize the new RDT group's allocations
2477  *
2478  * A new RDT group is being created on an allocation capable (CAT)
2479  * supporting system. Set this group up to start off with all usable
2480  * allocations. That is, all shareable and unused bits.
2481  *
2482  * All-zero CBM is invalid. If there are no more shareable bits available
2483  * on any domain then the entire allocation will fail.
2484  */
2485 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
2486 {
2487         u32 used_b = 0, unused_b = 0;
2488         u32 closid = rdtgrp->closid;
2489         struct rdt_resource *r;
2490         unsigned long tmp_cbm;
2491         enum rdtgrp_mode mode;
2492         struct rdt_domain *d;
2493         int i, ret;
2494         u32 *ctrl;
2495
2496         for_each_alloc_enabled_rdt_resource(r) {
2497                 /*
2498                  * Only initialize default allocations for CBM cache
2499                  * resources
2500                  */
2501                 if (r->rid == RDT_RESOURCE_MBA)
2502                         continue;
2503                 list_for_each_entry(d, &r->domains, list) {
2504                         d->have_new_ctrl = false;
2505                         d->new_ctrl = r->cache.shareable_bits;
2506                         used_b = r->cache.shareable_bits;
2507                         ctrl = d->ctrl_val;
2508                         for (i = 0; i < closids_supported(); i++, ctrl++) {
2509                                 if (closid_allocated(i) && i != closid) {
2510                                         mode = rdtgroup_mode_by_closid(i);
2511                                         if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
2512                                                 continue;
2513                                         used_b |= *ctrl;
2514                                         if (mode == RDT_MODE_SHAREABLE)
2515                                                 d->new_ctrl |= *ctrl;
2516                                 }
2517                         }
2518                         if (d->plr && d->plr->cbm > 0)
2519                                 used_b |= d->plr->cbm;
2520                         unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
2521                         unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
2522                         d->new_ctrl |= unused_b;
2523                         /*
2524                          * Force the initial CBM to be valid, user can
2525                          * modify the CBM based on system availability.
2526                          */
2527                         cbm_ensure_valid(&d->new_ctrl, r);
2528                         /*
2529                          * Assign the u32 CBM to an unsigned long to ensure
2530                          * that bitmap_weight() does not access out-of-bound
2531                          * memory.
2532                          */
2533                         tmp_cbm = d->new_ctrl;
2534                         if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
2535                             r->cache.min_cbm_bits) {
2536                                 rdt_last_cmd_printf("no space on %s:%d\n",
2537                                                     r->name, d->id);
2538                                 return -ENOSPC;
2539                         }
2540                         d->have_new_ctrl = true;
2541                 }
2542         }
2543
2544         for_each_alloc_enabled_rdt_resource(r) {
2545                 /*
2546                  * Only initialize default allocations for CBM cache
2547                  * resources
2548                  */
2549                 if (r->rid == RDT_RESOURCE_MBA)
2550                         continue;
2551                 ret = update_domains(r, rdtgrp->closid);
2552                 if (ret < 0) {
2553                         rdt_last_cmd_puts("failed to initialize allocations\n");
2554                         return ret;
2555                 }
2556                 rdtgrp->mode = RDT_MODE_SHAREABLE;
2557         }
2558
2559         return 0;
2560 }
2561
2562 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
2563                              struct kernfs_node *prgrp_kn,
2564                              const char *name, umode_t mode,
2565                              enum rdt_group_type rtype, struct rdtgroup **r)
2566 {
2567         struct rdtgroup *prdtgrp, *rdtgrp;
2568         struct kernfs_node *kn;
2569         uint files = 0;
2570         int ret;
2571
2572         prdtgrp = rdtgroup_kn_lock_live(parent_kn);
2573         rdt_last_cmd_clear();
2574         if (!prdtgrp) {
2575                 ret = -ENODEV;
2576                 rdt_last_cmd_puts("directory was removed\n");
2577                 goto out_unlock;
2578         }
2579
2580         if (rtype == RDTMON_GROUP &&
2581             (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2582              prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
2583                 ret = -EINVAL;
2584                 rdt_last_cmd_puts("pseudo-locking in progress\n");
2585                 goto out_unlock;
2586         }
2587
2588         /* allocate the rdtgroup. */
2589         rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
2590         if (!rdtgrp) {
2591                 ret = -ENOSPC;
2592                 rdt_last_cmd_puts("kernel out of memory\n");
2593                 goto out_unlock;
2594         }
2595         *r = rdtgrp;
2596         rdtgrp->mon.parent = prdtgrp;
2597         rdtgrp->type = rtype;
2598         INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
2599
2600         /* kernfs creates the directory for rdtgrp */
2601         kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
2602         if (IS_ERR(kn)) {
2603                 ret = PTR_ERR(kn);
2604                 rdt_last_cmd_puts("kernfs create error\n");
2605                 goto out_free_rgrp;
2606         }
2607         rdtgrp->kn = kn;
2608
2609         /*
2610          * kernfs_remove() will drop the reference count on "kn" which
2611          * will free it. But we still need it to stick around for the
2612          * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
2613          * which will be dropped by kernfs_put() in rdtgroup_remove().
2614          */
2615         kernfs_get(kn);
2616
2617         ret = rdtgroup_kn_set_ugid(kn);
2618         if (ret) {
2619                 rdt_last_cmd_puts("kernfs perm error\n");
2620                 goto out_destroy;
2621         }
2622
2623         files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
2624         ret = rdtgroup_add_files(kn, files);
2625         if (ret) {
2626                 rdt_last_cmd_puts("kernfs fill error\n");
2627                 goto out_destroy;
2628         }
2629
2630         if (rdt_mon_capable) {
2631                 ret = alloc_rmid();
2632                 if (ret < 0) {
2633                         rdt_last_cmd_puts("out of RMIDs\n");
2634                         goto out_destroy;
2635                 }
2636                 rdtgrp->mon.rmid = ret;
2637
2638                 ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
2639                 if (ret) {
2640                         rdt_last_cmd_puts("kernfs subdir error\n");
2641                         goto out_idfree;
2642                 }
2643         }
2644         kernfs_activate(kn);
2645
2646         /*
2647          * The caller unlocks the parent_kn upon success.
2648          */
2649         return 0;
2650
2651 out_idfree:
2652         free_rmid(rdtgrp->mon.rmid);
2653 out_destroy:
2654         kernfs_put(rdtgrp->kn);
2655         kernfs_remove(rdtgrp->kn);
2656 out_free_rgrp:
2657         kfree(rdtgrp);
2658 out_unlock:
2659         rdtgroup_kn_unlock(parent_kn);
2660         return ret;
2661 }
2662
2663 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
2664 {
2665         kernfs_remove(rgrp->kn);
2666         free_rmid(rgrp->mon.rmid);
2667         rdtgroup_remove(rgrp);
2668 }
2669
2670 /*
2671  * Create a monitor group under "mon_groups" directory of a control
2672  * and monitor group(ctrl_mon). This is a resource group
2673  * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
2674  */
2675 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
2676                               struct kernfs_node *prgrp_kn,
2677                               const char *name,
2678                               umode_t mode)
2679 {
2680         struct rdtgroup *rdtgrp, *prgrp;
2681         int ret;
2682
2683         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
2684                                 &rdtgrp);
2685         if (ret)
2686                 return ret;
2687
2688         prgrp = rdtgrp->mon.parent;
2689         rdtgrp->closid = prgrp->closid;
2690
2691         /*
2692          * Add the rdtgrp to the list of rdtgrps the parent
2693          * ctrl_mon group has to track.
2694          */
2695         list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
2696
2697         rdtgroup_kn_unlock(parent_kn);
2698         return ret;
2699 }
2700
2701 /*
2702  * These are rdtgroups created under the root directory. Can be used
2703  * to allocate and monitor resources.
2704  */
2705 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
2706                                    struct kernfs_node *prgrp_kn,
2707                                    const char *name, umode_t mode)
2708 {
2709         struct rdtgroup *rdtgrp;
2710         struct kernfs_node *kn;
2711         u32 closid;
2712         int ret;
2713
2714         ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
2715                                 &rdtgrp);
2716         if (ret)
2717                 return ret;
2718
2719         kn = rdtgrp->kn;
2720         ret = closid_alloc();
2721         if (ret < 0) {
2722                 rdt_last_cmd_puts("out of CLOSIDs\n");
2723                 goto out_common_fail;
2724         }
2725         closid = ret;
2726         ret = 0;
2727
2728         rdtgrp->closid = closid;
2729         ret = rdtgroup_init_alloc(rdtgrp);
2730         if (ret < 0)
2731                 goto out_id_free;
2732
2733         list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
2734
2735         if (rdt_mon_capable) {
2736                 /*
2737                  * Create an empty mon_groups directory to hold the subset
2738                  * of tasks and cpus to monitor.
2739                  */
2740                 ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
2741                 if (ret) {
2742                         rdt_last_cmd_puts("kernfs subdir error\n");
2743                         goto out_del_list;
2744                 }
2745         }
2746
2747         goto out_unlock;
2748
2749 out_del_list:
2750         list_del(&rdtgrp->rdtgroup_list);
2751 out_id_free:
2752         closid_free(closid);
2753 out_common_fail:
2754         mkdir_rdt_prepare_clean(rdtgrp);
2755 out_unlock:
2756         rdtgroup_kn_unlock(parent_kn);
2757         return ret;
2758 }
2759
2760 /*
2761  * We allow creating mon groups only with in a directory called "mon_groups"
2762  * which is present in every ctrl_mon group. Check if this is a valid
2763  * "mon_groups" directory.
2764  *
2765  * 1. The directory should be named "mon_groups".
2766  * 2. The mon group itself should "not" be named "mon_groups".
2767  *   This makes sure "mon_groups" directory always has a ctrl_mon group
2768  *   as parent.
2769  */
2770 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
2771 {
2772         return (!strcmp(kn->name, "mon_groups") &&
2773                 strcmp(name, "mon_groups"));
2774 }
2775
2776 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
2777                           umode_t mode)
2778 {
2779         /* Do not accept '\n' to avoid unparsable situation. */
2780         if (strchr(name, '\n'))
2781                 return -EINVAL;
2782
2783         /*
2784          * If the parent directory is the root directory and RDT
2785          * allocation is supported, add a control and monitoring
2786          * subdirectory
2787          */
2788         if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
2789                 return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
2790
2791         /*
2792          * If RDT monitoring is supported and the parent directory is a valid
2793          * "mon_groups" directory, add a monitoring subdirectory.
2794          */
2795         if (rdt_mon_capable && is_mon_groups(parent_kn, name))
2796                 return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
2797
2798         return -EPERM;
2799 }
2800
2801 static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2802                               cpumask_var_t tmpmask)
2803 {
2804         struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
2805         int cpu;
2806
2807         /* Give any tasks back to the parent group */
2808         rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
2809
2810         /* Update per cpu rmid of the moved CPUs first */
2811         for_each_cpu(cpu, &rdtgrp->cpu_mask)
2812                 per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
2813         /*
2814          * Update the MSR on moved CPUs and CPUs which have moved
2815          * task running on them.
2816          */
2817         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2818         update_closid_rmid(tmpmask, NULL);
2819
2820         rdtgrp->flags = RDT_DELETED;
2821         free_rmid(rdtgrp->mon.rmid);
2822
2823         /*
2824          * Remove the rdtgrp from the parent ctrl_mon group's list
2825          */
2826         WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
2827         list_del(&rdtgrp->mon.crdtgrp_list);
2828
2829         kernfs_remove(rdtgrp->kn);
2830
2831         return 0;
2832 }
2833
2834 static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
2835                                 struct rdtgroup *rdtgrp)
2836 {
2837         rdtgrp->flags = RDT_DELETED;
2838         list_del(&rdtgrp->rdtgroup_list);
2839
2840         kernfs_remove(rdtgrp->kn);
2841         return 0;
2842 }
2843
2844 static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
2845                                cpumask_var_t tmpmask)
2846 {
2847         int cpu;
2848
2849         /* Give any tasks back to the default group */
2850         rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
2851
2852         /* Give any CPUs back to the default group */
2853         cpumask_or(&rdtgroup_default.cpu_mask,
2854                    &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2855
2856         /* Update per cpu closid and rmid of the moved CPUs first */
2857         for_each_cpu(cpu, &rdtgrp->cpu_mask) {
2858                 per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
2859                 per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
2860         }
2861
2862         /*
2863          * Update the MSR on moved CPUs and CPUs which have moved
2864          * task running on them.
2865          */
2866         cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
2867         update_closid_rmid(tmpmask, NULL);
2868
2869         closid_free(rdtgrp->closid);
2870         free_rmid(rdtgrp->mon.rmid);
2871
2872         rdtgroup_ctrl_remove(kn, rdtgrp);
2873
2874         /*
2875          * Free all the child monitor group rmids.
2876          */
2877         free_all_child_rdtgrp(rdtgrp);
2878
2879         return 0;
2880 }
2881
2882 static int rdtgroup_rmdir(struct kernfs_node *kn)
2883 {
2884         struct kernfs_node *parent_kn = kn->parent;
2885         struct rdtgroup *rdtgrp;
2886         cpumask_var_t tmpmask;
2887         int ret = 0;
2888
2889         if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
2890                 return -ENOMEM;
2891
2892         rdtgrp = rdtgroup_kn_lock_live(kn);
2893         if (!rdtgrp) {
2894                 ret = -EPERM;
2895                 goto out;
2896         }
2897
2898         /*
2899          * If the rdtgroup is a ctrl_mon group and parent directory
2900          * is the root directory, remove the ctrl_mon group.
2901          *
2902          * If the rdtgroup is a mon group and parent directory
2903          * is a valid "mon_groups" directory, remove the mon group.
2904          */
2905         if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
2906             rdtgrp != &rdtgroup_default) {
2907                 if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2908                     rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
2909                         ret = rdtgroup_ctrl_remove(kn, rdtgrp);
2910                 } else {
2911                         ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
2912                 }
2913         } else if (rdtgrp->type == RDTMON_GROUP &&
2914                  is_mon_groups(parent_kn, kn->name)) {
2915                 ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
2916         } else {
2917                 ret = -EPERM;
2918         }
2919
2920 out:
2921         rdtgroup_kn_unlock(kn);
2922         free_cpumask_var(tmpmask);
2923         return ret;
2924 }
2925
2926 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
2927 {
2928         if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
2929                 seq_puts(seq, ",cdp");
2930
2931         if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
2932                 seq_puts(seq, ",cdpl2");
2933
2934         if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
2935                 seq_puts(seq, ",mba_MBps");
2936
2937         return 0;
2938 }
2939
2940 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
2941         .mkdir          = rdtgroup_mkdir,
2942         .rmdir          = rdtgroup_rmdir,
2943         .show_options   = rdtgroup_show_options,
2944 };
2945
2946 static int __init rdtgroup_setup_root(void)
2947 {
2948         int ret;
2949
2950         rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
2951                                       KERNFS_ROOT_CREATE_DEACTIVATED |
2952                                       KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
2953                                       &rdtgroup_default);
2954         if (IS_ERR(rdt_root))
2955                 return PTR_ERR(rdt_root);
2956
2957         mutex_lock(&rdtgroup_mutex);
2958
2959         rdtgroup_default.closid = 0;
2960         rdtgroup_default.mon.rmid = 0;
2961         rdtgroup_default.type = RDTCTRL_GROUP;
2962         INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
2963
2964         list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
2965
2966         ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
2967         if (ret) {
2968                 kernfs_destroy_root(rdt_root);
2969                 goto out;
2970         }
2971
2972         rdtgroup_default.kn = rdt_root->kn;
2973         kernfs_activate(rdtgroup_default.kn);
2974
2975 out:
2976         mutex_unlock(&rdtgroup_mutex);
2977
2978         return ret;
2979 }
2980
2981 /*
2982  * rdtgroup_init - rdtgroup initialization
2983  *
2984  * Setup resctrl file system including set up root, create mount point,
2985  * register rdtgroup filesystem, and initialize files under root directory.
2986  *
2987  * Return: 0 on success or -errno
2988  */
2989 int __init rdtgroup_init(void)
2990 {
2991         int ret = 0;
2992
2993         seq_buf_init(&last_cmd_status, last_cmd_status_buf,
2994                      sizeof(last_cmd_status_buf));
2995
2996         ret = rdtgroup_setup_root();
2997         if (ret)
2998                 return ret;
2999
3000         ret = sysfs_create_mount_point(fs_kobj, "resctrl");
3001         if (ret)
3002                 goto cleanup_root;
3003
3004         ret = register_filesystem(&rdt_fs_type);
3005         if (ret)
3006                 goto cleanup_mountpoint;
3007
3008         /*
3009          * Adding the resctrl debugfs directory here may not be ideal since
3010          * it would let the resctrl debugfs directory appear on the debugfs
3011          * filesystem before the resctrl filesystem is mounted.
3012          * It may also be ok since that would enable debugging of RDT before
3013          * resctrl is mounted.
3014          * The reason why the debugfs directory is created here and not in
3015          * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
3016          * during the debugfs directory creation also &sb->s_type->i_mutex_key
3017          * (the lockdep class of inode->i_rwsem). Other filesystem
3018          * interactions (eg. SyS_getdents) have the lock ordering:
3019          * &sb->s_type->i_mutex_key --> &mm->mmap_sem
3020          * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
3021          * is taken, thus creating dependency:
3022          * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
3023          * issues considering the other two lock dependencies.
3024          * By creating the debugfs directory here we avoid a dependency
3025          * that may cause deadlock (even though file operations cannot
3026          * occur until the filesystem is mounted, but I do not know how to
3027          * tell lockdep that).
3028          */
3029         debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
3030
3031         return 0;
3032
3033 cleanup_mountpoint:
3034         sysfs_remove_mount_point(fs_kobj, "resctrl");
3035 cleanup_root:
3036         kernfs_destroy_root(rdt_root);
3037
3038         return ret;
3039 }
3040
3041 void __exit rdtgroup_exit(void)
3042 {
3043         debugfs_remove_recursive(debugfs_resctrl);
3044         unregister_filesystem(&rdt_fs_type);
3045         sysfs_remove_mount_point(fs_kobj, "resctrl");
3046         kernfs_destroy_root(rdt_root);
3047 }