/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/lustre/llite/rw26.c
 *
 * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>

#include <linux/migrate.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>

#define DEBUG_SUBSYSTEM S_LLITE

#include "llite_internal.h"

/**
 * Implements the Linux VM address_space::invalidatepage() method. This method
 * is called when a page is truncated from a file, either as a result of an
 * explicit truncate, or when the inode is removed from memory (as a result of
 * final iput(), umount, or memory-pressure-induced icache shrinking).
 *
 * [0, offset] bytes of the page remain valid (this is for the case of a
 * not-page-aligned truncate). Lustre leaves the partially truncated page in
 * the cache, relying on struct inode::i_size to limit further accesses.
 */
static void ll_invalidatepage(struct page *vmpage, unsigned int offset,
                              unsigned int length)
{
        struct inode     *inode;
        struct lu_env    *env;
        struct cl_page   *page;
        struct cl_object *obj;

        int refcheck;

        LASSERT(PageLocked(vmpage));
        LASSERT(!PageWriteback(vmpage));

        /*
         * It is safe to not check anything in invalidatepage/releasepage
         * below because they are run with the page locked, and all our I/O
         * happens with locked pages too.
         */
        if (offset == 0 && length == PAGE_SIZE) {
                env = cl_env_get(&refcheck);
                if (!IS_ERR(env)) {
                        inode = vmpage->mapping->host;
                        obj = ll_i2info(inode)->lli_clob;
                        if (obj) {
                                page = cl_vmpage_page(vmpage, obj);
                                if (page) {
                                        cl_page_delete(env, page);
                                        cl_page_put(env, page);
                                }
                        } else {
                                LASSERT(vmpage->private == 0);
                        }
                        cl_env_put(env, &refcheck);
                }
        }
}

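/**
 * Implements the Linux VM address_space::releasepage() method. Called when
 * the kernel wants to drop a clean page from the page cache: refuse (return
 * 0) if the page is dirty, under writeback, or still referenced or in use by
 * the cl_page layer; otherwise delete the attached cl_page and return 1 so
 * the VM may free the page.
 */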
static int ll_releasepage(struct page *vmpage, gfp_t gfp_mask)
{
        struct lu_env     *env;
        void                    *cookie;
        struct cl_object  *obj;
        struct cl_page    *page;
        struct address_space *mapping;
        int result = 0;

        LASSERT(PageLocked(vmpage));
        if (PageWriteback(vmpage) || PageDirty(vmpage))
                return 0;

        mapping = vmpage->mapping;
        if (!mapping)
                return 1;

        obj = ll_i2info(mapping->host)->lli_clob;
        if (!obj)
                return 1;

        /* 1 for caller, 1 for cl_page and 1 for page cache */
        if (page_count(vmpage) > 3)
                return 0;

        page = cl_vmpage_page(vmpage, obj);
        if (!page)
                return 1;

        cookie = cl_env_reenter();
        env = cl_env_percpu_get();
        LASSERT(!IS_ERR(env));

        if (!cl_page_in_use(page)) {
                result = 1;
                cl_page_delete(env, page);
        }

        /* To use the percpu env array, the call path cannot be rescheduled;
         * otherwise the percpu array will be corrupted if ll_releasepage()
         * is called again on the same CPU.
         *
         * If this page holds the last reference on the cl_object, the
         * following call path may cause a reschedule:
         *   cl_page_put -> cl_page_free -> cl_object_put ->
         *     lu_object_put -> lu_object_free -> lov_delete_raid0.
         *
         * However, the kernel can't get rid of this inode until all pages have
         * been cleaned up. Since we hold the page lock here, it's pretty safe
         * that we won't get into the object delete path.
         */
        LASSERT(cl_object_refc(obj) > 1);
        cl_page_put(env, page);

        cl_env_percpu_put(env);
        cl_env_reexit(cookie);
        return result;
}

#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL)

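/*
 * Pin the user pages backing a direct I/O buffer. The page pointer array is
 * allocated here and the pages are pinned with get_user_pages_fast();
 * requests larger than MAX_DIRECTIO_SIZE are rejected with -EFBIG to avoid
 * arithmetic overflow. Returns the number of pages pinned, or a negative
 * errno on failure.
 */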
static inline int ll_get_user_pages(int rw, unsigned long user_addr,
                                    size_t size, struct page ***pages,
                                    int *max_pages)
{
        int result = -ENOMEM;

        /* set an arbitrary limit to prevent arithmetic overflow */
        if (size > MAX_DIRECTIO_SIZE) {
                *pages = NULL;
                return -EFBIG;
        }

        *max_pages = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        *max_pages -= user_addr >> PAGE_SHIFT;

        *pages = libcfs_kvzalloc(*max_pages * sizeof(**pages), GFP_NOFS);
        if (*pages) {
                result = get_user_pages_fast(user_addr, *max_pages,
                                             (rw == READ), *pages);
                if (unlikely(result <= 0))
                        kvfree(*pages);
        }

        return result;
}

/* ll_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * @npages: number of pages in the array
 * @do_dirty: mark the pages dirty (via set_page_dirty_lock()) before
 *            releasing them, e.g. after a direct read into the buffer
 */
static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
{
        int i;

        for (i = 0; i < npages; i++) {
                if (do_dirty)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
        kvfree(pages);
}

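/*
 * Perform direct I/O on the array of pinned pages described by an
 * ll_dio_pages structure: wrap each page in a transient cl_page, copy data
 * directly through any page that is already cached (CPT_CACHEABLE), queue
 * the remaining pages, and submit the queue synchronously. Returns the
 * number of bytes requested (ldp_size) on success, or a negative errno.
 */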
ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
                           int rw, struct inode *inode,
                           struct ll_dio_pages *pv)
{
        struct cl_page    *clp;
        struct cl_2queue  *queue;
        struct cl_object  *obj = io->ci_obj;
        int i;
        ssize_t rc = 0;
        loff_t file_offset  = pv->ldp_start_offset;
        size_t size = pv->ldp_size;
        int page_count      = pv->ldp_nr;
        struct page **pages = pv->ldp_pages;
        size_t page_size = cl_page_size(obj);
        bool do_io;
        int  io_pages       = 0;

        queue = &io->ci_queue;
        cl_2queue_init(queue);
        for (i = 0; i < page_count; i++) {
                if (pv->ldp_offsets)
                        file_offset = pv->ldp_offsets[i];

                LASSERT(!(file_offset & (page_size - 1)));
                clp = cl_page_find(env, obj, cl_index(obj, file_offset),
                                   pv->ldp_pages[i], CPT_TRANSIENT);
                if (IS_ERR(clp)) {
                        rc = PTR_ERR(clp);
                        break;
                }

                rc = cl_page_own(env, io, clp);
                if (rc) {
                        LASSERT(clp->cp_state == CPS_FREEING);
                        cl_page_put(env, clp);
                        break;
                }

                do_io = true;

                /* check the page type: if the page is already a cached (host)
                 * page, copy the data directly
                 */
                if (clp->cp_type == CPT_CACHEABLE) {
                        struct page *vmpage = cl_page_vmpage(clp);
                        struct page *src_page;
                        struct page *dst_page;
                        void       *src;
                        void       *dst;

                        src_page = (rw == WRITE) ? pages[i] : vmpage;
                        dst_page = (rw == WRITE) ? vmpage : pages[i];

                        src = kmap_atomic(src_page);
                        dst = kmap_atomic(dst_page);
                        memcpy(dst, src, min(page_size, size));
                        kunmap_atomic(dst);
                        kunmap_atomic(src);

                        /* make sure the page will be added to the transfer by
                         * cl_io_submit()->...->vvp_page_prep_write().
                         */
                        if (rw == WRITE)
                                set_page_dirty(vmpage);

                        if (rw == READ) {
                                /* do not issue the page for a read, since
                                 * this may re-read a readahead page that does
                                 * not have the uptodate bit set.
                                 */
                                cl_page_disown(env, io, clp);
                                do_io = false;
                        }
                }

                if (likely(do_io)) {
                        /*
                         * Add the page to the incoming page list of the
                         * 2-queue.
                         */
                        cl_page_list_add(&queue->c2_qin, clp);

                        /*
                         * Set the page clip to tell the transfer formation
                         * engine that the page has to be sent even if it is
                         * beyond KMS.
                         */
                        cl_page_clip(env, clp, 0, min(size, page_size));

                        ++io_pages;
                }

                /* drop the reference taken by cl_page_find() */
                cl_page_put(env, clp);
                size -= page_size;
                file_offset += page_size;
        }

        if (rc == 0 && io_pages) {
                rc = cl_io_submit_sync(env, io,
                                       rw == READ ? CRT_READ : CRT_WRITE,
                                       queue, 0);
        }
        if (rc == 0)
                rc = pv->ldp_size;

        cl_2queue_discard(env, io, queue);
        cl_2queue_disown(env, io, queue);
        cl_2queue_fini(env, queue);
        return rc;
}
EXPORT_SYMBOL(ll_direct_rw_pages);

static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
                                   int rw, struct inode *inode,
                                   struct address_space *mapping,
                                   size_t size, loff_t file_offset,
                                   struct page **pages, int page_count)
{
        struct ll_dio_pages pvec = {
                .ldp_pages      = pages,
                .ldp_nr         = page_count,
                .ldp_size       = size,
                .ldp_offsets    = NULL,
                .ldp_start_offset = file_offset
        };

        return ll_direct_rw_pages(env, io, rw, inode, &pvec);
}

/* This is the maximum size of a single O_DIRECT request, based on the
 * kmalloc limit.  We need to fit all of the brw_page structs, each one
 * representing PAGE_SIZE worth of user data, into a single buffer, and
 * then truncate this to be a full-sized RPC.  For 4kB PAGE_SIZE this is
 * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc.
 */
#define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) *       \
                       PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1))
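/*
 * Implements the address_space_operations::direct_IO() method for Lustre.
 * The file offset, transfer size, and user buffers must all be PAGE_SIZE
 * aligned. The iov_iter is processed in chunks of at most MAX_DIO_SIZE: the
 * user pages of each chunk are pinned and handed to ll_direct_IO_26_seg();
 * on -ENOMEM the chunk size is reduced and the chunk retried. Returns the
 * number of bytes transferred, or a negative errno.
 */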
static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter)
{
        struct lu_env *env;
        struct cl_io *io;
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        loff_t file_offset = iocb->ki_pos;
        ssize_t count = iov_iter_count(iter);
        ssize_t tot_bytes = 0, result = 0;
        struct ll_inode_info *lli = ll_i2info(inode);
        long size = MAX_DIO_SIZE;
        int refcheck;

        if (!lli->lli_has_smd)
                return -EBADF;

        /* Check EOF by ourselves */
        if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode))
                return 0;

        /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
        if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK))
                return -EINVAL;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n",
               PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
               file_offset, file_offset, count >> PAGE_SHIFT,
               MAX_DIO_SIZE >> PAGE_SHIFT);

        /* Check that all user buffers are aligned as well */
        if (iov_iter_alignment(iter) & ~PAGE_MASK)
                return -EINVAL;

        env = cl_env_get(&refcheck);
        LASSERT(!IS_ERR(env));
        io = vvp_env_io(env)->vui_cl.cis_io;
        LASSERT(io);

        while (iov_iter_count(iter)) {
                struct page **pages;
                size_t offs;

                count = min_t(size_t, iov_iter_count(iter), size);
                if (iov_iter_rw(iter) == READ) {
                        if (file_offset >= i_size_read(inode))
                                break;
                        if (file_offset + count > i_size_read(inode))
                                count = i_size_read(inode) - file_offset;
                }

                result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
                if (likely(result > 0)) {
                        int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);

                        result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter),
                                                     inode, file->f_mapping,
                                                     result, file_offset, pages,
                                                     n);
                        ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ);
                }
                if (unlikely(result <= 0)) {
                        /* If we can't allocate a large enough buffer
                         * for the request, shrink it to a smaller
                         * PAGE_SIZE multiple and try again.
                         * We should always be able to kmalloc for a
                         * page worth of page pointers = 4MB on i386.
                         */
                        if (result == -ENOMEM &&
                            size > (PAGE_SIZE / sizeof(*pages)) *
                            PAGE_SIZE) {
                                size = ((((size / 2) - 1) |
                                         ~PAGE_MASK) + 1) &
                                        PAGE_MASK;
                                CDEBUG(D_VFSTRACE, "DIO size now %lu\n",
                                       size);
                                continue;
                        }

                        goto out;
                }
                iov_iter_advance(iter, result);
                tot_bytes += result;
                file_offset += result;
        }
out:
        if (tot_bytes > 0) {
                struct vvp_io *vio = vvp_env_io(env);

                /* no commit async for direct IO */
                vio->u.write.vui_written += tot_bytes;
        }

        cl_env_put(env, &refcheck);
        return tot_bytes ? tot_bytes : result;
}

/**
 * Prepare a partially written-to page for a write.
 */
static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io,
                                   struct cl_page *pg)
{
        struct cl_attr *attr   = vvp_env_thread_attr(env);
        struct cl_object *obj  = io->ci_obj;
        struct vvp_page *vpg   = cl_object_page_slice(obj, pg);
        loff_t          offset = cl_offset(obj, vvp_index(vpg));
        int             result;

        cl_object_attr_lock(obj);
        result = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);
        if (result == 0) {
                /*
                 * If we are writing to a new page, there is no need to read
                 * old data.  The extent locking will have updated the KMS,
                 * and for our purposes here we can treat it like i_size.
                 */
                if (attr->cat_kms <= offset) {
                        char *kaddr = kmap_atomic(vpg->vpg_page);

                        memset(kaddr, 0, cl_page_size(obj));
                        kunmap_atomic(kaddr);
                } else if (vpg->vpg_defer_uptodate) {
                        vpg->vpg_ra_used = 1;
                } else {
                        result = ll_page_sync_io(env, io, pg, CRT_READ);
                }
        }
        return result;
}

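/*
 * Implements the address_space_operations::write_begin() method. Grabs and
 * locks the page-cache page for the write, committing pages already queued
 * for write-out when the nowait grab fails or the page is dirty (to avoid
 * deadlocking while holding a dirty page lock), attaches the corresponding
 * cl_page, and reads or zeroes the page when a partial write needs the
 * existing data.
 */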
static int ll_write_begin(struct file *file, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned flags,
                          struct page **pagep, void **fsdata)
{
        struct ll_cl_context *lcc;
        const struct lu_env  *env;
        struct cl_io   *io;
        struct cl_page *page;
        struct cl_object *clob = ll_i2info(mapping->host)->lli_clob;
        pgoff_t index = pos >> PAGE_SHIFT;
        struct page *vmpage = NULL;
        unsigned int from = pos & (PAGE_SIZE - 1);
        unsigned int to = from + len;
        int result = 0;

        CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len);

        lcc = ll_cl_find(file);
        if (!lcc) {
                result = -EIO;
                goto out;
        }

        env = lcc->lcc_env;
        io  = lcc->lcc_io;

        /* To avoid deadlock, try to lock the page first. */
        vmpage = grab_cache_page_nowait(mapping, index);
        if (unlikely(!vmpage || PageDirty(vmpage) || PageWriteback(vmpage))) {
                struct vvp_io *vio = vvp_env_io(env);
                struct cl_page_list *plist = &vio->u.write.vui_queue;

                /* if the page is already in the dirty cache, we have to
                 * commit the queued pages right now; otherwise, it may cause
                 * a deadlock because it holds the page lock of a dirty page
                 * and requests more grants. It's okay for the dirty page to
                 * be the first one in the commit page list, though.
                 */
                if (vmpage && plist->pl_nr > 0) {
                        unlock_page(vmpage);
                        put_page(vmpage);
                        vmpage = NULL;
                }

                /* commit pages and then wait for page lock */
                result = vvp_io_write_commit(env, io);
                if (result < 0)
                        goto out;

                if (!vmpage) {
                        vmpage = grab_cache_page_write_begin(mapping, index,
                                                             flags);
                        if (!vmpage) {
                                result = -ENOMEM;
                                goto out;
                        }
                }
        }

        page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE);
        if (IS_ERR(page)) {
                result = PTR_ERR(page);
                goto out;
        }

        lcc->lcc_page = page;
        lu_ref_add(&page->cp_reference, "cl_io", io);

        cl_page_assume(env, io, page);
        if (!PageUptodate(vmpage)) {
                /*
                 * We're completely overwriting an existing page,
                 * so _don't_ set it up to date until commit_write
                 */
                if (from == 0 && to == PAGE_SIZE) {
                        CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n");
                        POISON_PAGE(vmpage, 0x11);
                } else {
                        /* TODO: can be optimized at the OSC layer to check if
                         * it is a lockless IO. In that case, it's not
                         * necessary to read the data.
                         */
                        result = ll_prepare_partial_page(env, io, page);
                        if (result == 0)
                                SetPageUptodate(vmpage);
                }
        }
        if (result < 0)
                cl_page_unassume(env, io, page);
out:
        if (result < 0) {
                if (vmpage) {
                        unlock_page(vmpage);
                        put_page(vmpage);
                }
        } else {
                *pagep = vmpage;
                *fsdata = lcc;
        }
        return result;
}

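/*
 * Implements the address_space_operations::write_end() method. Queues the
 * just-written page on the vvp_io write queue, and commits the queue when
 * the page was already dirty, the queue holds a full RPC worth of pages,
 * nothing was copied, or the file requires synchronous writes.
 */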
static int ll_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *vmpage, void *fsdata)
{
        struct ll_cl_context *lcc = fsdata;
        const struct lu_env *env;
        struct cl_io *io;
        struct vvp_io *vio;
        struct cl_page *page;
        unsigned from = pos & (PAGE_SIZE - 1);
        bool unplug = false;
        int result = 0;

        put_page(vmpage);

        env  = lcc->lcc_env;
        page = lcc->lcc_page;
        io   = lcc->lcc_io;
        vio  = vvp_env_io(env);

        LASSERT(cl_page_is_owned(page, io));
        if (copied > 0) {
                struct cl_page_list *plist = &vio->u.write.vui_queue;

                lcc->lcc_page = NULL; /* page will be queued */

                /* Add it to the write queue */
                cl_page_list_add(plist, page);
                if (plist->pl_nr == 1) /* first page */
                        vio->u.write.vui_from = from;
                else
                        LASSERT(from == 0);
                vio->u.write.vui_to = from + copied;

                /*
                 * Commit the queue to address the deadlock in
                 * balance_dirty_pages(), where this dirty page may be
                 * written back in the same thread.
                 */
                if (PageDirty(vmpage))
                        unplug = true;

                /* We may have one full RPC, commit it soon */
                if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES)
                        unplug = true;

                CL_PAGE_DEBUG(D_VFSTRACE, env, page,
                              "queued page: %d.\n", plist->pl_nr);
        } else {
                cl_page_disown(env, io, page);

                lcc->lcc_page = NULL;
                lu_ref_del(&page->cp_reference, "cl_io", io);
                cl_page_put(env, page);

                /* page list is not contiguous now, commit it now */
                unplug = true;
        }

        if (unplug ||
            file->f_flags & O_SYNC || IS_SYNC(file_inode(file)))
                result = vvp_io_write_commit(env, io);

        return result >= 0 ? copied : result;
}

#ifdef CONFIG_MIGRATION
static int ll_migratepage(struct address_space *mapping,
                          struct page *newpage, struct page *page,
                          enum migrate_mode mode)
{
        /* Always fail page migration until we have a proper implementation */
        return -EIO;
}
#endif

const struct address_space_operations ll_aops = {
        .readpage       = ll_readpage,
        .direct_IO      = ll_direct_IO_26,
        .writepage      = ll_writepage,
        .writepages     = ll_writepages,
        .set_page_dirty = __set_page_dirty_nobuffers,
        .write_begin    = ll_write_begin,
        .write_end      = ll_write_end,
        .invalidatepage = ll_invalidatepage,
        .releasepage    = (void *)ll_releasepage,
#ifdef CONFIG_MIGRATION
        .migratepage    = ll_migratepage,
#endif
};