drivers/infiniband/sw/rdmavt/cq.c

   1 /*
   2  * Copyright(c) 2016 Intel Corporation.
   3  *
   4  * This file is provided under a dual BSD/GPLv2 license.  When using or
   5  * redistributing this file, you may do so under either license.
   6  *
   7  * GPL LICENSE SUMMARY
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of version 2 of the GNU General Public License as
  11  * published by the Free Software Foundation.
  12  *
  13  * This program is distributed in the hope that it will be useful, but
  14  * WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * General Public License for more details.
  17  *
  18  * BSD LICENSE
  19  *
  20  * Redistribution and use in source and binary forms, with or without
  21  * modification, are permitted provided that the following conditions
  22  * are met:
  23  *
  24  *  - Redistributions of source code must retain the above copyright
  25  *    notice, this list of conditions and the following disclaimer.
  26  *  - Redistributions in binary form must reproduce the above copyright
  27  *    notice, this list of conditions and the following disclaimer in
  28  *    the documentation and/or other materials provided with the
  29  *    distribution.
  30  *  - Neither the name of Intel Corporation nor the names of its
  31  *    contributors may be used to endorse or promote products derived
  32  *    from this software without specific prior written permission.
  33  *
  34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45  *
  46  */
  47
  48 #include <linux/slab.h>
  49 #include <linux/vmalloc.h>
  50 #include <linux/kthread.h>
  51 #include "cq.h"
  52 #include "vt.h"
  53 #include "trace.h"
  54
  55 /**
  56  * rvt_cq_enter - add a new entry to the completion queue
  57  * @cq: completion queue
  58  * @entry: work completion entry to add
  59  * @sig: true if @entry is solicited
  60  *
  61  * This may be called with qp->s_lock held.
  62  */
  63 void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
  64 {
  65         struct rvt_cq_wc *wc;
  66         unsigned long flags;
  67         u32 head;
  68         u32 next;
  69
  70         spin_lock_irqsave(&cq->lock, flags);
  71
  72         /*
  73          * Note that the head pointer might be writable by user processes.
  74          * Take care to verify it is a sane value.
  75          */
  76         wc = cq->queue;
  77         head = wc->head;
  78         if (head >= (unsigned)cq->ibcq.cqe) {
  79                 head = cq->ibcq.cqe;
  80                 next = 0;
  81         } else {
  82                 next = head + 1;
  83         }
  84
  85         if (unlikely(next == wc->tail)) {
  86                 spin_unlock_irqrestore(&cq->lock, flags);
  87                 if (cq->ibcq.event_handler) {
  88                         struct ib_event ev;
  89
  90                         ev.device = cq->ibcq.device;
  91                         ev.element.cq = &cq->ibcq;
  92                         ev.event = IB_EVENT_CQ_ERR;
  93                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
  94                 }
  95                 return;
  96         }
  97         trace_rvt_cq_enter(cq, entry, head);
  98         if (cq->ip) {
  99                 wc->uqueue[head].wr_id = entry->wr_id;
 100                 wc->uqueue[head].status = entry->status;
 101                 wc->uqueue[head].opcode = entry->opcode;
 102                 wc->uqueue[head].vendor_err = entry->vendor_err;
 103                 wc->uqueue[head].byte_len = entry->byte_len;
 104                 wc->uqueue[head].ex.imm_data =
 105                         (__u32 __force)entry->ex.imm_data;
 106                 wc->uqueue[head].qp_num = entry->qp->qp_num;
 107                 wc->uqueue[head].src_qp = entry->src_qp;
 108                 wc->uqueue[head].wc_flags = entry->wc_flags;
 109                 wc->uqueue[head].pkey_index = entry->pkey_index;
 110                 wc->uqueue[head].slid = ib_lid_cpu16(entry->slid);
 111                 wc->uqueue[head].sl = entry->sl;
 112                 wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
 113                 wc->uqueue[head].port_num = entry->port_num;
 114                 /* Make sure entry is written before the head index. */
 115                 smp_wmb();
 116         } else {
 117                 wc->kqueue[head] = *entry;
 118         }
 119         wc->head = next;
 120
 121         if (cq->notify == IB_CQ_NEXT_COMP ||
 122             (cq->notify == IB_CQ_SOLICITED &&
 123              (solicited || entry->status != IB_WC_SUCCESS))) {
 124                 struct kthread_worker *worker;
 125
 126                 /*
 127                  * This will cause send_complete() to be called in
 128                  * another thread.
 129                  */
 130                 rcu_read_lock();
 131                 worker = rcu_dereference(cq->rdi->worker);
 132                 if (likely(worker)) {
 133                         cq->notify = RVT_CQ_NONE;
 134                         cq->triggered++;
 135                         kthread_queue_work(worker, &cq->comptask);
 136                 }
 137                 rcu_read_unlock();
 138         }
 139
 140         spin_unlock_irqrestore(&cq->lock, flags);
 141 }
 142 EXPORT_SYMBOL(rvt_cq_enter);
 143
 144 static void send_complete(struct kthread_work *work)
 145 {
 146         struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask);
 147
 148         /*
 149          * The completion handler will most likely rearm the notification
 150          * and poll for all pending entries.  If a new completion entry
 151          * is added while we are in this routine, queue_work()
 152          * won't call us again until we return so we check triggered to
 153          * see if we need to call the handler again.
 154          */
 155         for (;;) {
 156                 u8 triggered = cq->triggered;
 157
 158                 /*
 159                  * IPoIB connected mode assumes the callback is from a
 160                  * soft IRQ. We simulate this by blocking "bottom halves".
 161                  * See the implementation for ipoib_cm_handle_tx_wc(),
 162                  * netif_tx_lock_bh() and netif_tx_lock().
 163                  */
 164                 local_bh_disable();
 165                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
 166                 local_bh_enable();
 167
 168                 if (cq->triggered == triggered)
 169                         return;
 170         }
 171 }
 172
 173 /**
 174  * rvt_create_cq - create a completion queue
 175  * @ibdev: the device this completion queue is attached to
 176  * @attr: creation attributes
 177  * @context: unused by the QLogic_IB driver
 178  * @udata: user data for libibverbs.so
 179  *
 180  * Called by ib_create_cq() in the generic verbs code.
 181  *
 182  * Return: pointer to the completion queue or negative errno values
 183  * for failure.
 184  */
 185 struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
 186                             const struct ib_cq_init_attr *attr,
 187                             struct ib_ucontext *context,
 188                             struct ib_udata *udata)
 189 {
 190         struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
 191         struct rvt_cq *cq;
 192         struct rvt_cq_wc *wc;
 193         struct ib_cq *ret;
 194         u32 sz;
 195         unsigned int entries = attr->cqe;
 196
 197         if (attr->flags)
 198                 return ERR_PTR(-EINVAL);
 199
 200         if (entries < 1 || entries > rdi->dparms.props.max_cqe)
 201                 return ERR_PTR(-EINVAL);
 202
 203         /* Allocate the completion queue structure. */
 204         cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
 205         if (!cq)
 206                 return ERR_PTR(-ENOMEM);
 207
 208         /*
 209          * Allocate the completion queue entries and head/tail pointers.
 210          * This is allocated separately so that it can be resized and
 211          * also mapped into user space.
 212          * We need to use vmalloc() in order to support mmap and large
 213          * numbers of entries.
 214          */
 215         sz = sizeof(*wc);
 216         if (udata && udata->outlen >= sizeof(__u64))
 217                 sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
 218         else
 219                 sz += sizeof(struct ib_wc) * (entries + 1);
 220         wc = udata ?
 221                 vmalloc_user(sz) :
 222                 vzalloc_node(sz, rdi->dparms.node);
 223         if (!wc) {
 224                 ret = ERR_PTR(-ENOMEM);
 225                 goto bail_cq;
 226         }
 227
 228         /*
 229          * Return the address of the WC as the offset to mmap.
 230          * See rvt_mmap() for details.
 231          */
 232         if (udata && udata->outlen >= sizeof(__u64)) {
 233                 int err;
 234
 235                 cq->ip = rvt_create_mmap_info(rdi, sz, context, wc);
 236                 if (!cq->ip) {
 237                         ret = ERR_PTR(-ENOMEM);
 238                         goto bail_wc;
 239                 }
 240
 241                 err = ib_copy_to_udata(udata, &cq->ip->offset,
 242                                        sizeof(cq->ip->offset));
 243                 if (err) {
 244                         ret = ERR_PTR(err);
 245                         goto bail_ip;
 246                 }
 247         }
 248
 249         spin_lock_irq(&rdi->n_cqs_lock);
 250         if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
 251                 spin_unlock_irq(&rdi->n_cqs_lock);
 252                 ret = ERR_PTR(-ENOMEM);
 253                 goto bail_ip;
 254         }
 255
 256         rdi->n_cqs_allocated++;
 257         spin_unlock_irq(&rdi->n_cqs_lock);
 258
 259         if (cq->ip) {
 260                 spin_lock_irq(&rdi->pending_lock);
 261                 list_add(&cq->ip->pending_mmaps, &rdi->pending_mmaps);
 262                 spin_unlock_irq(&rdi->pending_lock);
 263         }
 264
 265         /*
 266          * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
 267          * The number of entries should be >= the number requested or return
 268          * an error.
 269          */
 270         cq->rdi = rdi;
 271         cq->ibcq.cqe = entries;
 272         cq->notify = RVT_CQ_NONE;
 273         spin_lock_init(&cq->lock);
 274         kthread_init_work(&cq->comptask, send_complete);
 275         cq->queue = wc;
 276
 277         ret = &cq->ibcq;
 278
 279         goto done;
 280
 281 bail_ip:
 282         kfree(cq->ip);
 283 bail_wc:
 284         vfree(wc);
 285 bail_cq:
 286         kfree(cq);
 287 done:
 288         return ret;
 289 }
 290
 291 /**
 292  * rvt_destroy_cq - destroy a completion queue
 293  * @ibcq: the completion queue to destroy.
 294  *
 295  * Called by ib_destroy_cq() in the generic verbs code.
 296  *
 297  * Return: always 0
 298  */
 299 int rvt_destroy_cq(struct ib_cq *ibcq)
 300 {
 301         struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 302         struct rvt_dev_info *rdi = cq->rdi;
 303
 304         kthread_flush_work(&cq->comptask);
 305         spin_lock_irq(&rdi->n_cqs_lock);
 306         rdi->n_cqs_allocated--;
 307         spin_unlock_irq(&rdi->n_cqs_lock);
 308         if (cq->ip)
 309                 kref_put(&cq->ip->ref, rvt_release_mmap_info);
 310         else
 311                 vfree(cq->queue);
 312         kfree(cq);
 313
 314         return 0;
 315 }
 316
 317 /**
 318  * rvt_req_notify_cq - change the notification type for a completion queue
 319  * @ibcq: the completion queue
 320  * @notify_flags: the type of notification to request
 321  *
 322  * This may be called from interrupt context.  Also called by
 323  * ib_req_notify_cq() in the generic verbs code.
 324  *
 325  * Return: 0 for success.
 326  */
 327 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
 328 {
 329         struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 330         unsigned long flags;
 331         int ret = 0;
 332
 333         spin_lock_irqsave(&cq->lock, flags);
 334         /*
 335          * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
 336          * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
 337          */
 338         if (cq->notify != IB_CQ_NEXT_COMP)
 339                 cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
 340
 341         if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
 342             cq->queue->head != cq->queue->tail)
 343                 ret = 1;
 344
 345         spin_unlock_irqrestore(&cq->lock, flags);
 346
 347         return ret;
 348 }
 349
 350 /**
 351  * rvt_resize_cq - change the size of the CQ
 352  * @ibcq: the completion queue
 353  *
 354  * Return: 0 for success.
 355  */
 356 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 357 {
 358         struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 359         struct rvt_cq_wc *old_wc;
 360         struct rvt_cq_wc *wc;
 361         u32 head, tail, n;
 362         int ret;
 363         u32 sz;
 364         struct rvt_dev_info *rdi = cq->rdi;
 365
 366         if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
 367                 return -EINVAL;
 368
 369         /*
 370          * Need to use vmalloc() if we want to support large #s of entries.
 371          */
 372         sz = sizeof(*wc);
 373         if (udata && udata->outlen >= sizeof(__u64))
 374                 sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
 375         else
 376                 sz += sizeof(struct ib_wc) * (cqe + 1);
 377         wc = udata ?
 378                 vmalloc_user(sz) :
 379                 vzalloc_node(sz, rdi->dparms.node);
 380         if (!wc)
 381                 return -ENOMEM;
 382
 383         /* Check that we can write the offset to mmap. */
 384         if (udata && udata->outlen >= sizeof(__u64)) {
 385                 __u64 offset = 0;
 386
 387                 ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
 388                 if (ret)
 389                         goto bail_free;
 390         }
 391
 392         spin_lock_irq(&cq->lock);
 393         /*
 394          * Make sure head and tail are sane since they
 395          * might be user writable.
 396          */
 397         old_wc = cq->queue;
 398         head = old_wc->head;
 399         if (head > (u32)cq->ibcq.cqe)
 400                 head = (u32)cq->ibcq.cqe;
 401         tail = old_wc->tail;
 402         if (tail > (u32)cq->ibcq.cqe)
 403                 tail = (u32)cq->ibcq.cqe;
 404         if (head < tail)
 405                 n = cq->ibcq.cqe + 1 + head - tail;
 406         else
 407                 n = head - tail;
 408         if (unlikely((u32)cqe < n)) {
 409                 ret = -EINVAL;
 410                 goto bail_unlock;
 411         }
 412         for (n = 0; tail != head; n++) {
 413                 if (cq->ip)
 414                         wc->uqueue[n] = old_wc->uqueue[tail];
 415                 else
 416                         wc->kqueue[n] = old_wc->kqueue[tail];
 417                 if (tail == (u32)cq->ibcq.cqe)
 418                         tail = 0;
 419                 else
 420                         tail++;
 421         }
 422         cq->ibcq.cqe = cqe;
 423         wc->head = n;
 424         wc->tail = 0;
 425         cq->queue = wc;
 426         spin_unlock_irq(&cq->lock);
 427
 428         vfree(old_wc);
 429
 430         if (cq->ip) {
 431                 struct rvt_mmap_info *ip = cq->ip;
 432
 433                 rvt_update_mmap_info(rdi, ip, sz, wc);
 434
 435                 /*
 436                  * Return the offset to mmap.
 437                  * See rvt_mmap() for details.
 438                  */
 439                 if (udata && udata->outlen >= sizeof(__u64)) {
 440                         ret = ib_copy_to_udata(udata, &ip->offset,
 441                                                sizeof(ip->offset));
 442                         if (ret)
 443                                 return ret;
 444                 }
 445
 446                 spin_lock_irq(&rdi->pending_lock);
 447                 if (list_empty(&ip->pending_mmaps))
 448                         list_add(&ip->pending_mmaps, &rdi->pending_mmaps);
 449                 spin_unlock_irq(&rdi->pending_lock);
 450         }
 451
 452         return 0;
 453
 454 bail_unlock:
 455         spin_unlock_irq(&cq->lock);
 456 bail_free:
 457         vfree(wc);
 458         return ret;
 459 }
 460
 461 /**
 462  * rvt_poll_cq - poll for work completion entries
 463  * @ibcq: the completion queue to poll
 464  * @num_entries: the maximum number of entries to return
 465  * @entry: pointer to array where work completions are placed
 466  *
 467  * This may be called from interrupt context.  Also called by ib_poll_cq()
 468  * in the generic verbs code.
 469  *
 470  * Return: the number of completion entries polled.
 471  */
 472 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 473 {
 474         struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
 475         struct rvt_cq_wc *wc;
 476         unsigned long flags;
 477         int npolled;
 478         u32 tail;
 479
 480         /* The kernel can only poll a kernel completion queue */
 481         if (cq->ip)
 482                 return -EINVAL;
 483
 484         spin_lock_irqsave(&cq->lock, flags);
 485
 486         wc = cq->queue;
 487         tail = wc->tail;
 488         if (tail > (u32)cq->ibcq.cqe)
 489                 tail = (u32)cq->ibcq.cqe;
 490         for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
 491                 if (tail == wc->head)
 492                         break;
 493                 /* The kernel doesn't need a RMB since it has the lock. */
 494                 trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled);
 495                 *entry = wc->kqueue[tail];
 496                 if (tail >= cq->ibcq.cqe)
 497                         tail = 0;
 498                 else
 499                         tail++;
 500         }
 501         wc->tail = tail;
 502
 503         spin_unlock_irqrestore(&cq->lock, flags);
 504
 505         return npolled;
 506 }
 507
 508 /**
 509  * rvt_driver_cq_init - Init cq resources on behalf of driver
 510  * @rdi: rvt dev structure
 511  *
 512  * Return: 0 on success
 513  */
 514 int rvt_driver_cq_init(struct rvt_dev_info *rdi)
 515 {
 516         int cpu;
 517         struct kthread_worker *worker;
 518
 519         if (rcu_access_pointer(rdi->worker))
 520                 return 0;
 521
 522         spin_lock_init(&rdi->n_cqs_lock);
 523
 524         cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
 525         worker = kthread_create_worker_on_cpu(cpu, 0,
 526                                               "%s", rdi->dparms.cq_name);
 527         if (IS_ERR(worker))
 528                 return PTR_ERR(worker);
 529
 530         set_user_nice(worker->task, MIN_NICE);
 531         RCU_INIT_POINTER(rdi->worker, worker);
 532         return 0;
 533 }
 534
 535 /**
 536  * rvt_cq_exit - tear down cq reources
 537  * @rdi: rvt dev structure
 538  */
 539 void rvt_cq_exit(struct rvt_dev_info *rdi)
 540 {
 541         struct kthread_worker *worker;
 542
 543         if (!rcu_access_pointer(rdi->worker))
 544                 return;
 545
 546         spin_lock(&rdi->n_cqs_lock);
 547         worker = rcu_dereference_protected(rdi->worker,
 548                                            lockdep_is_held(&rdi->n_cqs_lock));
 549         if (!worker) {
 550                 spin_unlock(&rdi->n_cqs_lock);
 551                 return;
 552         }
 553         RCU_INIT_POINTER(rdi->worker, NULL);
 554         spin_unlock(&rdi->n_cqs_lock);
 555         synchronize_rcu();
 556
 557         kthread_destroy_worker(worker);
 558 }