/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include <linux/sort.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"
#include "nfs4_fs.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq);
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                                struct list_head *tmp_list);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * When the server sends a list of layout types, we choose one in the order
 * given in the list below.
 *
 * FIXME: should this list be configurable in some fashion? module param?
 *        mount option? something else?
 */
static const u32 ld_prefs[] = {
        LAYOUT_SCSI,
        LAYOUT_BLOCK_VOLUME,
        LAYOUT_OSD2_OBJECTS,
        LAYOUT_FLEX_FILES,
        LAYOUT_NFSV4_1_FILES,
        0
};

static int
ld_cmp(const void *e1, const void *e2)
{
        u32 ld1 = *((u32 *)e1);
        u32 ld2 = *((u32 *)e2);
        int i;

        for (i = 0; ld_prefs[i] != 0; i++) {
                if (ld1 == ld_prefs[i])
                        return -1;

                if (ld2 == ld_prefs[i])
                        return 1;
        }
        return 0;
}

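/*
 * For illustration (hypothetical server input, not from any real trace):
 * given fsinfo->layouttype = { LAYOUT_NFSV4_1_FILES, LAYOUT_FLEX_FILES,
 * LAYOUT_SCSI }, sorting with ld_cmp() reorders the array to
 * { LAYOUT_SCSI, LAYOUT_FLEX_FILES, LAYOUT_NFSV4_1_FILES }, so
 * set_pnfs_layoutdriver() below probes for a SCSI layout driver first:
 *
 *      sort(fsinfo->layouttype, fsinfo->nlayouttypes,
 *           sizeof(*fsinfo->layouttype), ld_cmp, NULL);
 */
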
/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @fsinfo: fsinfo->layouttype holds the layout types supported by the MDS.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      struct nfs_fsinfo *fsinfo)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;
        u32 id;
        int i;

        if (fsinfo->nlayouttypes == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
                        __func__, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }

        sort(fsinfo->layouttype, fsinfo->nlayouttypes,
                sizeof(*fsinfo->layouttype), ld_cmp, NULL);

        for (i = 0; i < fsinfo->nlayouttypes; i++) {
                id = fsinfo->layouttype[i];
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
                                        id);
                        ld_type = find_pnfs_driver(id);
                }
                if (ld_type)
                        break;
        }

        if (!ld_type) {
                dprintk("%s: No pNFS module found!\n", __func__);
                goto out_no_driver;
        }

        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

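/*
 * Note: the request_module() call above derives the module alias from
 * LAYOUT_NFSV4_1_MODULE_PREFIX ("nfs-layouttype4") and the layout type
 * number, so e.g. LAYOUT_NFSV4_1_FILES (type 1) loads the module that
 * declares MODULE_ALIAS("nfs-layouttype4-1").
 */
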
int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

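/*
 * A minimal sketch of how a layout driver hooks into the registration
 * interface above (hypothetical "example" driver; real drivers also fill
 * in layout header, I/O and device-management methods):
 *
 *      static struct pnfs_layoutdriver_type examplelayout_type = {
 *              .id             = LAYOUT_NFSV4_1_FILES,
 *              .name           = "LAYOUT_EXAMPLE",
 *              .owner          = THIS_MODULE,
 *              .alloc_lseg     = example_alloc_lseg,
 *              .free_lseg      = example_free_lseg,
 *      };
 *
 *      static int __init example_init(void)
 *      {
 *              return pnfs_register_layoutdriver(&examplelayout_type);
 *      }
 *
 *      static void __exit example_exit(void)
 *      {
 *              pnfs_unregister_layoutdriver(&examplelayout_type);
 *      }
 *
 * Registration fails with -EINVAL for id 0 or for a driver that omits
 * either alloc_lseg or free_lseg, as enforced above.
 */
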
/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        refcount_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
                struct nfs_client *clp = server->nfs_client;

                spin_lock(&clp->cl_lock);
                list_del_rcu(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_cred(lo->plh_lc_cred);
        return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode;

        if (!lo)
                return;
        inode = lo->plh_inode;
        pnfs_layoutreturn_before_put_layout_hdr(lo);

        if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
                pnfs_free_layout_hdr(lo);
        }
}

static struct inode *
pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = igrab(lo->plh_inode);
        if (inode)
                return inode;
        set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
        return NULL;
}

static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
                         u32 seq)
{
        if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        if (seq != 0) {
                WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
                lo->plh_return_seq = seq;
        }
}

static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *lseg;
        lo->plh_return_iomode = 0;
        lo->plh_return_seq = 0;
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }
}

static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
        clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
        clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
        smp_mb__after_atomic();
        wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static void
pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
                struct list_head *free_me)
{
        clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
        clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
        if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
}

/*
 * Update the seqid of a layout stateid after receiving
 * NFS4ERR_OLD_STATEID
 */
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
                struct pnfs_layout_range *dst_range,
                struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        bool ret = false;
        LIST_HEAD(head);
        int err;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo && pnfs_layout_is_valid(lo) &&
            nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
                /* Is our call using the most recent seqid? If so, bump it */
                if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
                        nfs4_stateid_seqid_inc(dst);
                        ret = true;
                        goto out;
                }
                /* Try to update the seqid to the most recent */
                err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
                if (err != -EBUSY) {
                        dst->seqid = lo->plh_stateid.seqid;
                        *dst_range = range;
                        ret = true;
                }
        }
out:
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        return ret;
}

/*
 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
 *
 * In order to continue using the pnfs_layout_hdr, a full recovery
 * is required.
 * Note that caller must hold inode->i_lock.
 */
int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
{
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        struct pnfs_layout_segment *lseg, *next;

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                pnfs_clear_lseg_state(lseg, lseg_list);
        pnfs_clear_layoutreturn_info(lo);
        pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
            !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                pnfs_clear_layoutreturn_waitbit(lo);
        return !list_empty(&lo->plh_segs);
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                refcount_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                refcount_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ? "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}

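/*
 * Worked example of the retry window above, assuming HZ == 1000: a fail
 * bit set at plh_retry_timestamp == T keeps layoutgets failing fast
 * while jiffies remains within 120000 ticks (two minutes) of T; once the
 * timestamp drops out of [jiffies - PNFS_LAYOUTGET_RETRY_TIMEOUT, jiffies],
 * the bit is cleared and layoutget is retried.
 */
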
static void
pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        INIT_LIST_HEAD(&lseg->pls_commits);
        refcount_set(&lseg->pls_refcount, 1);
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
        lseg->pls_range = *range;
        lseg->pls_seq = be32_to_cpu(stateid->seqid);
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
        if (lseg != NULL) {
                struct inode *inode = lseg->pls_layout->plh_inode;
                NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
        }
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        refcount_dec(&lo->plh_refcount);
        if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                return;
        if (list_empty(&lo->plh_segs) &&
            !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
            !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
                if (atomic_read(&lo->plh_outstanding) == 0)
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        }
}

static bool
pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
            pnfs_layout_is_valid(lo)) {
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
                list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
                return true;
        }
        return false;
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                refcount_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

        lo = lseg->pls_layout;
        inode = lo->plh_inode;

        if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                        spin_unlock(&inode->i_lock);
                        return;
                }
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
                        lseg = NULL;
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg(lseg);
                pnfs_put_layout_hdr(lo);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
                 const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = pnfs_end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = pnfs_end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}

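/*
 * E.g. (arbitrary illustrative numbers): l1 = { .offset = 0, .length =
 * 8192 } spans [0, 8192) and thus contains l2 = { .offset = 1024,
 * .length = 4096 }, which spans [1024, 5120). A length of
 * NFS4_MAX_UINT64 makes pnfs_end_offset() saturate, so such a range
 * contains any range starting at an equal or larger offset.
 */
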
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        if (!refcount_dec_and_test(&lseg->pls_refcount))
                return false;
        pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
        list_add(&lseg->pls_list, tmp_list);
        return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        refcount_read(&lseg->pls_refcount));
                if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
                        rv = 1;
        }
        return rv;
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
        return (s32)(s1 - s2) > 0;
}

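/*
 * The modular arithmetic handles 32-bit wraparound: e.g. for s1 == 2 and
 * s2 == 0xfffffffe, s1 - s2 == 4, so (s32)4 > 0 and s1 is correctly
 * treated as the newer seqid even though it is numerically smaller.
 */
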
static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
                 const struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *recall_range,
                u32 seq)
{
        if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
                return false;
        if (recall_range == NULL)
                return true;
        return pnfs_should_free_range(&lseg->pls_range, recall_range);
}

/**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
 * @tmp_list: list head where doomed lsegs should go
 * @recall_range: optional recall range argument to match (may be NULL)
 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
 *
 * Walk the list of lsegs in the layout header, and tear down any that should
 * be destroyed. If "recall_range" is specified then the segment must match
 * that range. If "seq" is non-zero, then only match segments that were handed
 * out at or before that sequence.
 *
 * Returns number of matching invalid lsegs remaining in list after scanning
 * it and purging them.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            const struct pnfs_layout_range *recall_range,
                            u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
                        dprintk("%s: freeing lseg %p iomode %d seq %u "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_seq,
                                lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
        dprintk("%s:Return %i\n", __func__, remaining);
        return remaining;
}

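/*
 * Illustration of the @seq filter (arbitrary values): with seq == 5,
 * pnfs_match_lseg_recall() matches lsegs whose pls_seq is 3 or 5
 * (assuming their ranges also match), but skips one with pls_seq == 7,
 * since pnfs_seqid_is_newer(7, 5) shows it was handed out after the
 * recall was issued.
 */
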
static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;

        list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
                if (pnfs_match_lseg_recall(lseg, range, seq))
                        list_move_tail(&lseg->pls_list, free_me);
        }
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        if (list_empty(free_me))
                return;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                pnfs_free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                pnfs_get_layout_hdr(lo);
                pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                nfs_commit_inode(&nfsi->vfs_inode, 0);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo;
        bool ret = false;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
                pnfs_get_layout_hdr(lo);
                list_add(&lo->plh_bulk_destroy, layout_list);
                ret = true;
        }
        spin_unlock(&inode->i_lock);
        return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                struct nfs_server *server,
                struct list_head *layout_list)
        __must_hold(&clp->cl_lock)
        __must_hold(RCU)
{
        struct pnfs_layout_hdr *lo, *next;
        struct inode *inode;

        list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
                if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
                    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
                    !list_empty(&lo->plh_bulk_destroy))
                        continue;
                /* If the sb is being destroyed, just bail */
                if (!nfs_sb_active(server->super))
                        break;
                inode = pnfs_grab_inode_layout_hdr(lo);
                if (inode != NULL) {
                        if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
                                list_del_rcu(&lo->plh_layouts);
                        if (pnfs_layout_add_bulk_destroy_list(inode,
                                                layout_list))
                                continue;
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                        iput(inode);
                } else {
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                }
                nfs_sb_deactive(server->super);
                spin_lock(&clp->cl_lock);
                rcu_read_lock();
                return -EAGAIN;
        }
        return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                bool is_bulk_recall)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
        LIST_HEAD(lseg_list);
        int ret = 0;

        while (!list_empty(layout_list)) {
                lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
                                plh_bulk_destroy);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                inode = lo->plh_inode;

                pnfs_layoutcommit_inode(inode, false);

                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
                if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
                        if (is_bulk_recall)
                                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
                }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
                nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                nfs_iput_and_deactive(inode);
        }
        return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
                struct nfs_fsid *fsid,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
                        continue;
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                server,
                                &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                        server,
                                        &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        pnfs_destroy_layouts_byclid(clp, false);
}

static void
pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
{
        const struct cred *old;

        if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
                old = xchg(&lo->plh_lc_cred, get_cred(cred));
                put_cred(old);
        }
}

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        const struct cred *cred, bool update_barrier)
{
        u32 oldseq, newseq, new_barrier = 0;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);

        if (!pnfs_layout_is_valid(lo)) {
                pnfs_set_layout_cred(lo, cred);
                nfs4_stateid_copy(&lo->plh_stateid, new);
                lo->plh_barrier = newseq;
                pnfs_clear_layoutreturn_info(lo);
                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                return;
        }
        if (pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                /*
                 * Because of wraparound, we want to keep the barrier
                 * "close" to the current seqids.
                 */
                new_barrier = newseq - atomic_read(&lo->plh_outstanding);
        }
        if (update_barrier)
                new_barrier = be32_to_cpu(new->seqid);
        else if (new_barrier == 0)
                return;
        if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                lo->plh_barrier = new_barrier;
}

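/*
 * Worked example of the barrier arithmetic above (illustrative numbers):
 * if the server returns seqid 12 while two LAYOUTGETs are still
 * outstanding, new_barrier becomes 12 - 2 == 10, so
 * pnfs_layout_stateid_blocked() below treats any stateid with a seqid
 * of 10 or less as blocked until the in-flight replies catch up.
 */
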
static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid)
{
        u32 seqid = be32_to_cpu(stateid->seqid);

        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if new layoutgets are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

static struct nfs_server *
pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
{
        struct nfs_server *server;

        if (inode) {
                server = NFS_SERVER(inode);
        } else {
                struct dentry *parent_dir = dget_parent(ctx->dentry);
                server = NFS_SERVER(parent_dir->d_inode);
                dput(parent_dir);
        }
        return server;
}

static void nfs4_free_pages(struct page **pages, size_t size)
{
        int i;

        if (!pages)
                return;

        for (i = 0; i < size; i++) {
                if (!pages[i])
                        break;
                __free_page(pages[i]);
        }
        kfree(pages);
}

static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
{
        struct page **pages;
        int i;

        pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
        if (!pages) {
                dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
                return NULL;
        }

        for (i = 0; i < size; i++) {
                pages[i] = alloc_page(gfp_flags);
                if (!pages[i]) {
                        dprintk("%s: failed to allocate page\n", __func__);
                        nfs4_free_pages(pages, i);
                        return NULL;
                }
        }

        return pages;
}

static struct nfs4_layoutget *
pnfs_alloc_init_layoutget_args(struct inode *ino,
           struct nfs_open_context *ctx,
           const nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
           gfp_t gfp_flags)
{
        struct nfs_server *server = pnfs_find_server(ino, ctx);
        size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
        size_t max_pages = max_response_pages(server);
        struct nfs4_layoutget *lgp;

        dprintk("--> %s\n", __func__);

        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return NULL;

        if (max_reply_sz) {
                size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (npages < max_pages)
                        max_pages = npages;
        }

        lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
        if (!lgp->args.layout.pages) {
                kfree(lgp);
                return NULL;
        }
        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
        lgp->res.layoutp = &lgp->args.layout;

        /* Don't confuse uninitialised result and success */
        lgp->res.status = -NFS4ERR_DELAY;

        lgp->args.minlength = PAGE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        if (ino) {
                loff_t i_size = i_size_read(ino);

                if (range->iomode == IOMODE_READ) {
                        if (range->offset >= i_size)
                                lgp->args.minlength = 0;
                        else if (i_size - range->offset < lgp->args.minlength)
                                lgp->args.minlength = i_size - range->offset;
                }
        }
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        pnfs_copy_range(&lgp->args.range, range);
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        nfs4_stateid_copy(&lgp->args.stateid, stateid);
        lgp->gfp_flags = gfp_flags;
        lgp->cred = ctx->cred;
        return lgp;
}

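/*
 * Example of the reply-buffer sizing above, assuming 4 KiB pages: a
 * driver that caps max_layoutget_response at 10000 bytes needs
 * (10000 + 4095) >> PAGE_SHIFT == 3 pages, so pglen is clamped to 12288
 * bytes instead of the server-derived max_pages default.
 */
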
void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
{
        size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;

        nfs4_free_pages(lgp->args.layout.pages, max_pages);
        if (lgp->args.inode)
                pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
        put_nfs_open_context(lgp->args.ctx);
        kfree(lgp);
}

static void pnfs_clear_layoutcommit(struct inode *inode,
                struct list_head *head)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct pnfs_layout_segment *lseg, *tmp;

        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                return;
        list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
                if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        continue;
                pnfs_lseg_dec_and_remove_zero(lseg, head);
        }
}

void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
                const nfs4_stateid *arg_stateid,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        struct inode *inode = lo->plh_inode;
        LIST_HEAD(freeme);

        spin_lock(&inode->i_lock);
        if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
            !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
                goto out_unlock;
        if (stateid) {
                u32 seq = be32_to_cpu(arg_stateid->seqid);

                pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
                pnfs_free_returned_lsegs(lo, &freeme, range, seq);
                pnfs_set_layout_stateid(lo, stateid, NULL, true);
        } else
                pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
        pnfs_clear_layoutreturn_waitbit(lo);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&freeme);
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
                const struct cred **cred,
                enum pnfs_iomode *iomode)
{
        /* Serialise LAYOUTGET/LAYOUTRETURN */
        if (atomic_read(&lo->plh_outstanding) != 0)
                return false;
        if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                return false;
        set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
                nfs4_stateid_copy(stateid, &lo->plh_stateid);
                *cred = get_cred(lo->plh_lc_cred);
                if (lo->plh_return_seq != 0)
                        stateid->seqid = cpu_to_be32(lo->plh_return_seq);
                if (iomode != NULL)
                        *iomode = lo->plh_return_iomode;
                pnfs_clear_layoutreturn_info(lo);
                return true;
        }
        nfs4_stateid_copy(stateid, &lo->plh_stateid);
        *cred = get_cred(lo->plh_lc_cred);
        if (iomode != NULL)
                *iomode = IOMODE_ANY;
        return true;
}

static void
pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
                struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid,
                enum pnfs_iomode iomode)
{
        struct inode *inode = lo->plh_inode;

        args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
        args->inode = inode;
        args->range.iomode = iomode;
        args->range.offset = 0;
        args->range.length = NFS4_MAX_UINT64;
        args->layout = lo;
        nfs4_stateid_copy(&args->stateid, stateid);
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
                       const nfs4_stateid *stateid,
                       const struct cred **pcred,
                       enum pnfs_iomode iomode,
                       bool sync)
{
        struct inode *ino = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        struct nfs4_layoutreturn *lrp;
        const struct cred *cred = *pcred;
        int status = 0;

        *pcred = NULL;
        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
                put_cred(cred);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
        lrp->args.ld_private = &lrp->ld_private;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
        lrp->cred = cred;
        if (ld->prepare_layoutreturn)
                ld->prepare_layoutreturn(&lrp->args);

        status = nfs4_proc_layoutreturn(lrp, sync);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *s;
        enum pnfs_iomode iomode;
        u32 seq;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return false;

        seq = lo->plh_return_seq;
        iomode = lo->plh_return_iomode;

        /* Defer layoutreturn until all recalled lsegs are done */
        list_for_each_entry(s, &lo->plh_segs, pls_list) {
                if (seq && pnfs_seqid_is_newer(s->pls_seq, seq))
                        continue;
                if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode)
                        continue;
                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
                        return false;
        }

        return true;
}

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
                const struct cred *cred;
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;

                send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
                        pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        const struct cred *cred;
        nfs4_stateid stateid;
        int status = 0;
        bool send, valid_layout;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        /* Is there an outstanding layoutreturn ? */
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                        TASK_UNINTERRUPTIBLE))
                        goto out_put_layout_hdr;
                spin_lock(&ino->i_lock);
        }
        valid_layout = pnfs_layout_is_valid(lo);
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);

        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
                        .iomode         = IOMODE_ANY,
                        .offset         = 0,
                        .length         = NFS4_MAX_UINT64,
                };
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
        }

        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
                        !valid_layout) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out_wait_layoutreturn;
        }

        send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
        spin_unlock(&ino->i_lock);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_wait_layoutreturn:
        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
out_put_layout_hdr:
        pnfs_free_lseg_list(&tmp_list);
        pnfs_put_layout_hdr(lo);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

int
pnfs_commit_and_return_layout(struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        int ret;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo == NULL) {
                spin_unlock(&inode->i_lock);
                return 0;
        }
        pnfs_get_layout_hdr(lo);
        /* Block new layoutgets and read/write to ds */
        lo->plh_block_lgets++;
        spin_unlock(&inode->i_lock);
        filemap_fdatawait(inode->i_mapping);
        ret = pnfs_layoutcommit_inode(inode, true);
        if (ret == 0)
                ret = _pnfs_return_layout(inode);
        spin_lock(&inode->i_lock);
        lo->plh_block_lgets--;
        spin_unlock(&inode->i_lock);
        pnfs_put_layout_hdr(lo);
        return ret;
}

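/*
 * pnfs_roc - check whether the layout can be returned on close ("ROC").
 * When no open or delegation state pins the layout, the NFS_LSEG_ROC
 * lsegs are invalidated and @args/@res are set up so the caller can
 * piggyback a LAYOUTRETURN on the CLOSE/DELEGRETURN compound, provided
 * @cred matches the layout credential. Returns true if the caller
 * should send that compound LAYOUTRETURN.
 */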
bool pnfs_roc(struct inode *ino,
                struct nfs4_layoutreturn_args *args,
                struct nfs4_layoutreturn_res *res,
                const struct cred *cred)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *next;
        const struct cred *lc_cred;
        nfs4_stateid stateid;
        enum pnfs_iomode iomode = 0;
        bool layoutreturn = false, roc = false;
        bool skip_read = false;

        if (!nfs_have_layout(ino))
                return false;
retry:
        rcu_read_lock();
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || !pnfs_layout_is_valid(lo) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                lo = NULL;
                goto out_noroc;
        }
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                rcu_read_unlock();
                wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                TASK_UNINTERRUPTIBLE);
                pnfs_put_layout_hdr(lo);
                goto retry;
        }

        /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ)) {
                if (nfs4_check_delegation(ino, FMODE_WRITE))
                        goto out_noroc;
                skip_read = true;
        }

        list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
                /* Don't return layout if there is open file state */
                if (state->state & FMODE_WRITE)
                        goto out_noroc;
                if (state->state & FMODE_READ)
                        skip_read = true;
        }

        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
                if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
                        continue;
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
                        continue;
                /*
                 * Note: mark lseg for return so pnfs_layout_remove_lseg
                 * doesn't invalidate the layout for us.
                 */
                set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
                if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                goto out_noroc;

        /* ROC in two conditions:
         * 1. there are ROC lsegs
         * 2. we don't send layoutreturn
         */
        /* lo ref dropped in pnfs_roc_release() */
        layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
        /* If the creds don't match, we can't compound the layoutreturn */
        if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
                goto out_noroc;

        roc = layoutreturn;
        pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
        res->lrs_present = 0;
        layoutreturn = false;
        put_cred(lc_cred);

out_noroc:
        spin_unlock(&ino->i_lock);
        rcu_read_unlock();
        pnfs_layoutcommit_inode(ino, true);
        if (roc) {
                struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
                if (ld->prepare_layoutreturn)
                        ld->prepare_layoutreturn(args);
                pnfs_put_layout_hdr(lo);
                return true;
        }
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
        pnfs_put_layout_hdr(lo);
        return false;
}

1487 int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
1488                 struct nfs4_layoutreturn_args **argpp,
1489                 struct nfs4_layoutreturn_res **respp,
1490                 int *ret)
1491 {
1492         struct nfs4_layoutreturn_args *arg = *argpp;
1493         int retval = -EAGAIN;
1494
1495         if (!arg)
1496                 return 0;
1497         /* Handle Layoutreturn errors */
1498         switch (*ret) {
1499         case 0:
1500                 retval = 0;
1501                 break;
1502         case -NFS4ERR_NOMATCHING_LAYOUT:
1503                 /* Was there an RPC level error? If not, retry */
1504                 if (task->tk_rpc_status == 0)
1505                         break;
1506                 /* If the call was not sent, let caller handle it */
1507                 if (!RPC_WAS_SENT(task))
1508                         return 0;
1509                 /*
1510                  * Otherwise, assume the call succeeded and
1511                  * that we need to release the layout
1512                  */
1513                 *ret = 0;
1514                 (*respp)->lrs_present = 0;
1515                 retval = 0;
1516                 break;
1517         case -NFS4ERR_DELAY:
1518                 /* Let the caller handle the retry */
1519                 *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1520                 return 0;
1521         case -NFS4ERR_OLD_STATEID:
1522                 if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
1523                                         &arg->range, inode))
1524                         break;
1525                 *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1526                 return -EAGAIN;
1527         }
1528         *argpp = NULL;
1529         *respp = NULL;
1530         return retval;
1531 }
1532
1533 void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
1534                 struct nfs4_layoutreturn_res *res,
1535                 int ret)
1536 {
1537         struct pnfs_layout_hdr *lo = args->layout;
1538         const nfs4_stateid *arg_stateid = NULL;
1539         const nfs4_stateid *res_stateid = NULL;
1540         struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
1541
1542         switch (ret) {
1543         case -NFS4ERR_NOMATCHING_LAYOUT:
1544                 break;
1545         case 0:
1546                 if (res->lrs_present)
1547                         res_stateid = &res->stateid;
1548                 /* Fallthrough */
1549         default:
1550                 arg_stateid = &args->stateid;
1551         }
1552         pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
1553                         res_stateid);
1554         if (ld_private && ld_private->ops && ld_private->ops->free)
1555                 ld_private->ops->free(ld_private);
1556         pnfs_put_layout_hdr(lo);
1557         trace_nfs4_layoutreturn_on_close(args->inode, 0);
1558 }
1559
1560 bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
1561 {
1562         struct nfs_inode *nfsi = NFS_I(ino);
1563         struct pnfs_layout_hdr *lo;
1564         bool sleep = false;
1565
1566         /* We might not have grabbed a reference on lo, so we need to
1567          * check it under i_lock */
1568         spin_lock(&ino->i_lock);
1569         lo = nfsi->layout;
1570         if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1571                 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1572                 sleep = true;
1573         }
1574         spin_unlock(&ino->i_lock);
1575         return sleep;
1576 }
1577
1578 /*
1579  * Compare two layout segments for sorting into layout cache.
1580  * We want to preferentially return RW over RO layouts, so ensure those
1581  * are seen first.
1582  */
1583 static s64
1584 pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
1585            const struct pnfs_layout_range *l2)
1586 {
1587         s64 d;
1588
1589         /* higher offsets sort later */
1590         d = l1->offset - l2->offset;
1591         if (d)
1592                 return d;
1593
1594         /* shorter lengths sort later (longest first) */
1595         d = l2->length - l1->length;
1596         if (d)
1597                 return d;
1598
1599         /* READ sorts later (RW first) */
1600         return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
1601 }
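
/*
 * Worked example (made-up ranges): under the comparator above,
 * {offset 0, length 8192, IOMODE_RW} sorts before
 * {offset 0, length 4096, IOMODE_RW} (longest first at equal offsets),
 * which in turn sorts before {offset 0, length 4096, IOMODE_READ}
 * (RW before READ when the ranges are otherwise equal).
 */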
1602
1603 static bool
1604 pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1605                 const struct pnfs_layout_range *l2)
1606 {
1607         return pnfs_lseg_range_cmp(l1, l2) > 0;
1608 }
1609
1610 static bool
1611 pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
1612                 struct pnfs_layout_segment *old)
1613 {
1614         return false;
1615 }
1616
1617 void
1618 pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1619                    struct pnfs_layout_segment *lseg,
1620                    bool (*is_after)(const struct pnfs_layout_range *,
1621                            const struct pnfs_layout_range *),
1622                    bool (*do_merge)(struct pnfs_layout_segment *,
1623                            struct pnfs_layout_segment *),
1624                    struct list_head *free_me)
1625 {
1626         struct pnfs_layout_segment *lp, *tmp;
1627
1628         dprintk("%s:Begin\n", __func__);
1629
1630         list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
1631                 if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
1632                         continue;
1633                 if (do_merge(lseg, lp)) {
1634                         mark_lseg_invalid(lp, free_me);
1635                         continue;
1636                 }
1637                 if (is_after(&lseg->pls_range, &lp->pls_range))
1638                         continue;
1639                 list_add_tail(&lseg->pls_list, &lp->pls_list);
1640                 dprintk("%s: inserted lseg %p "
1641                         "iomode %d offset %llu length %llu before "
1642                         "lp %p iomode %d offset %llu length %llu\n",
1643                         __func__, lseg, lseg->pls_range.iomode,
1644                         lseg->pls_range.offset, lseg->pls_range.length,
1645                         lp, lp->pls_range.iomode, lp->pls_range.offset,
1646                         lp->pls_range.length);
1647                 goto out;
1648         }
1649         list_add_tail(&lseg->pls_list, &lo->plh_segs);
1650         dprintk("%s: inserted lseg %p "
1651                 "iomode %d offset %llu length %llu at tail\n",
1652                 __func__, lseg, lseg->pls_range.iomode,
1653                 lseg->pls_range.offset, lseg->pls_range.length);
1654 out:
1655         pnfs_get_layout_hdr(lo);
1656
1657         dprintk("%s:Return\n", __func__);
1658 }
1659 EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
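
/*
 * A layout driver that wants its own ordering or merge policy calls
 * the generic helper from its ->add_lseg() callback. A minimal
 * sketch, assuming hypothetical driver helpers my_lseg_is_after()
 * and my_lseg_merge():
 *
 *	static void my_add_lseg(struct pnfs_layout_hdr *lo,
 *				struct pnfs_layout_segment *lseg,
 *				struct list_head *free_me)
 *	{
 *		pnfs_generic_layout_insert_lseg(lo, lseg,
 *						my_lseg_is_after,
 *						my_lseg_merge,
 *						free_me);
 *	}
 */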
1660
1661 static void
1662 pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1663                    struct pnfs_layout_segment *lseg,
1664                    struct list_head *free_me)
1665 {
1666         struct inode *inode = lo->plh_inode;
1667         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1668
1669         if (ld->add_lseg != NULL)
1670                 ld->add_lseg(lo, lseg, free_me);
1671         else
1672                 pnfs_generic_layout_insert_lseg(lo, lseg,
1673                                 pnfs_lseg_range_is_after,
1674                                 pnfs_lseg_no_merge,
1675                                 free_me);
1676 }
1677
1678 static struct pnfs_layout_hdr *
1679 alloc_init_layout_hdr(struct inode *ino,
1680                       struct nfs_open_context *ctx,
1681                       gfp_t gfp_flags)
1682 {
1683         struct pnfs_layout_hdr *lo;
1684
1685         lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
1686         if (!lo)
1687                 return NULL;
1688         refcount_set(&lo->plh_refcount, 1);
1689         INIT_LIST_HEAD(&lo->plh_layouts);
1690         INIT_LIST_HEAD(&lo->plh_segs);
1691         INIT_LIST_HEAD(&lo->plh_return_segs);
1692         INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1693         lo->plh_inode = ino;
1694         lo->plh_lc_cred = get_cred(ctx->cred);
1695         lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
1696         return lo;
1697 }
1698
1699 static struct pnfs_layout_hdr *
1700 pnfs_find_alloc_layout(struct inode *ino,
1701                        struct nfs_open_context *ctx,
1702                        gfp_t gfp_flags)
1703         __releases(&ino->i_lock)
1704         __acquires(&ino->i_lock)
1705 {
1706         struct nfs_inode *nfsi = NFS_I(ino);
1707         struct pnfs_layout_hdr *new = NULL;
1708
1709         dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
1710
1711         if (nfsi->layout != NULL)
1712                 goto out_existing;
1713         spin_unlock(&ino->i_lock);
1714         new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
1715         spin_lock(&ino->i_lock);
1716
1717         if (likely(nfsi->layout == NULL)) {     /* Won the race? */
1718                 nfsi->layout = new;
1719                 return new;
1720         } else if (new != NULL)
1721                 pnfs_free_layout_hdr(new);
1722 out_existing:
1723         pnfs_get_layout_hdr(nfsi->layout);
1724         return nfsi->layout;
1725 }
1726
1727 /*
1728  * iomode matching rules:
1729  * request      lseg    strict  match
1730  * iomode       iomode  iomode
1731  * -------      ------  ------  -----
1732  * ANY          READ    N/A     true
1733  * ANY          RW      N/A     true
1734  * RW           READ    N/A     false
1735  * RW           RW      N/A     true
1736  * READ         READ    N/A     true
1737  * READ         RW      true    false
1738  * READ         RW      false   true
1739  */
1740 static bool
1741 pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1742                  const struct pnfs_layout_range *range,
1743                  bool strict_iomode)
1744 {
1745         struct pnfs_layout_range range1;
1746
1747         if ((range->iomode == IOMODE_RW &&
1748              ls_range->iomode != IOMODE_RW) ||
1749             (range->iomode != ls_range->iomode &&
1750              strict_iomode) ||
1751             !pnfs_lseg_range_intersecting(ls_range, range))
1752                 return false;
1753
1754         /* range1 covers only the first byte in the range */
1755         range1 = *range;
1756         range1.length = 1;
1757         return pnfs_lseg_range_contained(ls_range, &range1);
1758 }
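
/*
 * Worked example of the table above: an RW lseg covering bytes
 * [0, 8192) satisfies a READ lookup at offset 0 only while
 * strict_iomode is false, whereas a READ lseg never satisfies an
 * RW lookup, strict or not.
 */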
1759
1760 /*
1761  * lookup range in layout
1762  */
1763 static struct pnfs_layout_segment *
1764 pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1765                 struct pnfs_layout_range *range,
1766                 bool strict_iomode)
1767 {
1768         struct pnfs_layout_segment *lseg, *ret = NULL;
1769
1770         dprintk("%s:Begin\n", __func__);
1771
1772         list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1773                 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1774                     pnfs_lseg_range_match(&lseg->pls_range, range,
1775                                           strict_iomode)) {
1776                         ret = pnfs_get_lseg(lseg);
1777                         break;
1778                 }
1779         }
1780
1781         dprintk("%s:Return lseg %p ref %d\n",
1782                 __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
1783         return ret;
1784 }
1785
1786 /*
1787  * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1788  * to the MDS or over pNFS
1789  *
1790  * The nfs_inode read_io and write_io fields are cumulative counters reset
1791  * when there are no layout segments. Note that in pnfs_update_layout iomode
1792  * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1793  * WRITE request.
1794  *
1795  * A return of true means use MDS I/O.
1796  *
1797  * From rfc 5661:
1798  * If a file's size is smaller than the file size threshold, data accesses
1799  * SHOULD be sent to the metadata server.  If an I/O request has a length that
1800  * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1801  * server.  If both file size and I/O size are provided, the client SHOULD
1802  * reach or exceed both thresholds before sending its read or write
1803  * requests to the data server.
1804  */
1805 static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1806                                      struct inode *ino, int iomode)
1807 {
1808         struct nfs4_threshold *t = ctx->mdsthreshold;
1809         struct nfs_inode *nfsi = NFS_I(ino);
1810         loff_t fsize = i_size_read(ino);
1811         bool size = false, size_set = false, io = false, io_set = false, ret = false;
1812
1813         if (t == NULL)
1814                 return ret;
1815
1816         dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
1817                 __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
1818
1819         switch (iomode) {
1820         case IOMODE_READ:
1821                 if (t->bm & THRESHOLD_RD) {
1822                         dprintk("%s fsize %llu\n", __func__, fsize);
1823                         size_set = true;
1824                         if (fsize < t->rd_sz)
1825                                 size = true;
1826                 }
1827                 if (t->bm & THRESHOLD_RD_IO) {
1828                         dprintk("%s nfsi->read_io %llu\n", __func__,
1829                                 nfsi->read_io);
1830                         io_set = true;
1831                         if (nfsi->read_io < t->rd_io_sz)
1832                                 io = true;
1833                 }
1834                 break;
1835         case IOMODE_RW:
1836                 if (t->bm & THRESHOLD_WR) {
1837                         dprintk("%s fsize %llu\n", __func__, fsize);
1838                         size_set = true;
1839                         if (fsize < t->wr_sz)
1840                                 size = true;
1841                 }
1842                 if (t->bm & THRESHOLD_WR_IO) {
1843                         dprintk("%s nfsi->write_io %llu\n", __func__,
1844                                 nfsi->write_io);
1845                         io_set = true;
1846                         if (nfsi->write_io < t->wr_io_sz)
1847                                 io = true;
1848                 }
1849                 break;
1850         }
1851         if (size_set && io_set) {
1852                 if (size && io)
1853                         ret = true;
1854         } else if (size || io)
1855                 ret = true;
1856
1857         dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
1858         return ret;
1859 }
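
/*
 * Worked example (made-up threshold values): with THRESHOLD_RD and
 * THRESHOLD_RD_IO both set in t->bm, rd_sz = 65536 and
 * rd_io_sz = 32768, a READ of a 4096-byte file that has accumulated
 * 1000 bytes of cumulative read I/O gives size = true and io = true,
 * so the function returns true and the I/O is sent to the MDS. If
 * only one of the two thresholds is set, satisfying that one alone
 * suffices.
 */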
1860
1861 static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1862 {
1863         /*
1864          * send layoutcommit as it can hold up layoutreturn due to lseg
1865          * reference
1866          */
1867         pnfs_layoutcommit_inode(lo->plh_inode, false);
1868         return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1869                                    nfs_wait_bit_killable,
1870                                    TASK_KILLABLE);
1871 }
1872
1873 static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
1874 {
1875         atomic_inc(&lo->plh_outstanding);
1876 }
1877
1878 static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
1879 {
1880         if (atomic_dec_and_test(&lo->plh_outstanding))
1881                 wake_up_var(&lo->plh_outstanding);
1882 }
1883
1884 static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1885 {
1886         unsigned long *bitlock = &lo->plh_flags;
1887
1888         clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1889         smp_mb__after_atomic();
1890         wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1891 }
1892
1893 static void _add_to_server_list(struct pnfs_layout_hdr *lo,
1894                                 struct nfs_server *server)
1895 {
1896         if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
1897                 struct nfs_client *clp = server->nfs_client;
1898
1899                 /* The lo must be on the clp list if there is any
1900                  * chance of a CB_LAYOUTRECALL(FILE) coming in.
1901                  */
1902                 spin_lock(&clp->cl_lock);
1903                 list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
1904                 spin_unlock(&clp->cl_lock);
1905         }
1906 }
1907
1908 /*
1909  * Layout segment is retrieved from the server if not cached.
1910  * The appropriate layout segment is referenced and returned to the caller.
1911  */
1912 struct pnfs_layout_segment *
1913 pnfs_update_layout(struct inode *ino,
1914                    struct nfs_open_context *ctx,
1915                    loff_t pos,
1916                    u64 count,
1917                    enum pnfs_iomode iomode,
1918                    bool strict_iomode,
1919                    gfp_t gfp_flags)
1920 {
1921         struct pnfs_layout_range arg = {
1922                 .iomode = iomode,
1923                 .offset = pos,
1924                 .length = count,
1925         };
1926         unsigned pg_offset;
1927         struct nfs_server *server = NFS_SERVER(ino);
1928         struct nfs_client *clp = server->nfs_client;
1929         struct pnfs_layout_hdr *lo = NULL;
1930         struct pnfs_layout_segment *lseg = NULL;
1931         struct nfs4_layoutget *lgp;
1932         nfs4_stateid stateid;
1933         long timeout = 0;
1934         unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1935         bool first;
1936
1937         if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1938                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1939                                  PNFS_UPDATE_LAYOUT_NO_PNFS);
1940                 goto out;
1941         }
1942
1943         if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1944                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1945                                  PNFS_UPDATE_LAYOUT_MDSTHRESH);
1946                 goto out;
1947         }
1948
1949 lookup_again:
1950         lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
1951         if (IS_ERR(lseg))
1952                 goto out;
1953         first = false;
1954         spin_lock(&ino->i_lock);
1955         lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1956         if (lo == NULL) {
1957                 spin_unlock(&ino->i_lock);
1958                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1959                                  PNFS_UPDATE_LAYOUT_NOMEM);
1960                 goto out;
1961         }
1962
1963         /* Do we even need to bother with this? */
1964         if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1965                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1966                                  PNFS_UPDATE_LAYOUT_BULK_RECALL);
1967                 dprintk("%s matches recall, use MDS\n", __func__);
1968                 goto out_unlock;
1969         }
1970
1971         /* if LAYOUTGET already failed once we don't try again */
1972         if (pnfs_layout_io_test_failed(lo, iomode)) {
1973                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1974                                  PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1975                 goto out_unlock;
1976         }
1977
1978         /*
1979          * If the layout segment list is empty, but there are outstanding
1980          * layoutget calls, then they might be subject to a layoutrecall.
1981          */
1982         if (list_empty(&lo->plh_segs) &&
1983             atomic_read(&lo->plh_outstanding) != 0) {
1984                 spin_unlock(&ino->i_lock);
1985                 lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
1986                                         !atomic_read(&lo->plh_outstanding)));
1987                 if (IS_ERR(lseg))
1988                         goto out_put_layout_hdr;
1989                 pnfs_put_layout_hdr(lo);
1990                 goto lookup_again;
1991         }
1992
1993         lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1994         if (lseg) {
1995                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1996                                 PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1997                 goto out_unlock;
1998         }
1999
2000         /*
2001          * Choose a stateid for the LAYOUTGET. If we don't have a layout
2002          * stateid, or it has been invalidated, then we must use the open
2003          * stateid.
2004          */
2005         if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
2006                 int status;
2007
2008                 /*
2009                  * The first layoutget for the file. Need to serialize per
2010                  * RFC 5661 Errata 3208.
2011                  */
2012                 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
2013                                      &lo->plh_flags)) {
2014                         spin_unlock(&ino->i_lock);
2015                         lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
2016                                                 NFS_LAYOUT_FIRST_LAYOUTGET,
2017                                                 TASK_KILLABLE));
2018                         if (IS_ERR(lseg))
2019                                 goto out_put_layout_hdr;
2020                         pnfs_put_layout_hdr(lo);
2021                         dprintk("%s retrying\n", __func__);
2022                         goto lookup_again;
2023                 }
2024
2025                 spin_unlock(&ino->i_lock);
2026                 first = true;
2027                 status = nfs4_select_rw_stateid(ctx->state,
2028                                         iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
2029                                         NULL, &stateid, NULL);
2030                 if (status != 0) {
2031                         lseg = ERR_PTR(status);
2032                         trace_pnfs_update_layout(ino, pos, count,
2033                                         iomode, lo, lseg,
2034                                         PNFS_UPDATE_LAYOUT_INVALID_OPEN);
2035                         nfs4_schedule_stateid_recovery(server, ctx->state);
2036                         pnfs_clear_first_layoutget(lo);
2037                         pnfs_put_layout_hdr(lo);
2038                         goto lookup_again;
2039                 }
2040                 spin_lock(&ino->i_lock);
2041         } else {
2042                 nfs4_stateid_copy(&stateid, &lo->plh_stateid);
2043         }
2044
2045         /*
2046          * Because we free lsegs before sending LAYOUTRETURN, we need to wait
2047          * for LAYOUTRETURN even if first is true.
2048          */
2049         if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
2050                 spin_unlock(&ino->i_lock);
2051                 dprintk("%s wait for layoutreturn\n", __func__);
2052                 lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
2053                 if (!IS_ERR(lseg)) {
2054                         if (first)
2055                                 pnfs_clear_first_layoutget(lo);
2056                         pnfs_put_layout_hdr(lo);
2057                         dprintk("%s retrying\n", __func__);
2058                         trace_pnfs_update_layout(ino, pos, count, iomode, lo,
2059                                         lseg, PNFS_UPDATE_LAYOUT_RETRY);
2060                         goto lookup_again;
2061                 }
2062                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2063                                 PNFS_UPDATE_LAYOUT_RETURN);
2064                 goto out_put_layout_hdr;
2065         }
2066
2067         if (pnfs_layoutgets_blocked(lo)) {
2068                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2069                                 PNFS_UPDATE_LAYOUT_BLOCKED);
2070                 goto out_unlock;
2071         }
2072         nfs_layoutget_begin(lo);
2073         spin_unlock(&ino->i_lock);
2074
2075         _add_to_server_list(lo, server);
2076
2077         pg_offset = arg.offset & ~PAGE_MASK;
2078         if (pg_offset) {
2079                 arg.offset -= pg_offset;
2080                 arg.length += pg_offset;
2081         }
2082         if (arg.length != NFS4_MAX_UINT64)
2083                 arg.length = PAGE_ALIGN(arg.length);
2084
2085         lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
2086         if (!lgp) {
2087                 trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
2088                                          PNFS_UPDATE_LAYOUT_NOMEM);
2089                 nfs_layoutget_end(lo);
2090                 goto out_put_layout_hdr;
2091         }
2092
2093         lseg = nfs4_proc_layoutget(lgp, &timeout);
2094         trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2095                                  PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
2096         nfs_layoutget_end(lo);
2097         if (IS_ERR(lseg)) {
2098                 switch (PTR_ERR(lseg)) {
2099                 case -EBUSY:
2100                         if (time_after(jiffies, giveup))
2101                                 lseg = NULL;
2102                         break;
2103                 case -ERECALLCONFLICT:
2104                 case -EAGAIN:
2105                         break;
2106                 default:
2107                         if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
2108                                 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2109                                 lseg = NULL;
2110                         }
2111                         goto out_put_layout_hdr;
2112                 }
2113                 if (lseg) {
2114                         if (first)
2115                                 pnfs_clear_first_layoutget(lo);
2116                         trace_pnfs_update_layout(ino, pos, count,
2117                                 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
2118                         pnfs_put_layout_hdr(lo);
2119                         goto lookup_again;
2120                 }
2121         } else {
2122                 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2123         }
2124
2125 out_put_layout_hdr:
2126         if (first)
2127                 pnfs_clear_first_layoutget(lo);
2128         trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2129                                  PNFS_UPDATE_LAYOUT_EXIT);
2130         pnfs_put_layout_hdr(lo);
2131 out:
2132         dprintk("%s: inode %s/%llu pNFS layout segment %s for "
2133                         "(%s, offset: %llu, length: %llu)\n",
2134                         __func__, ino->i_sb->s_id,
2135                         (unsigned long long)NFS_FILEID(ino),
2136                         IS_ERR_OR_NULL(lseg) ? "not found" : "found",
2137                         iomode == IOMODE_RW ? "read/write" : "read-only",
2138                         (unsigned long long)pos,
2139                         (unsigned long long)count);
2140         return lseg;
2141 out_unlock:
2142         spin_unlock(&ino->i_lock);
2143         goto out_put_layout_hdr;
2144 }
2145 EXPORT_SYMBOL_GPL(pnfs_update_layout);
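
/*
 * Note for callers: pnfs_update_layout() has three outcomes that all
 * need handling, as in this minimal sketch (see
 * pnfs_generic_pg_init_read() below for a real caller):
 *
 *	lseg = pnfs_update_layout(inode, ctx, pos, count,
 *				  IOMODE_READ, false, GFP_KERNEL);
 *	if (IS_ERR(lseg))
 *		...fail with PTR_ERR(lseg)...
 *	else if (!lseg)
 *		...fall back to I/O through the MDS...
 */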
2146
2147 static bool
2148 pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
2149 {
2150         switch (range->iomode) {
2151         case IOMODE_READ:
2152         case IOMODE_RW:
2153                 break;
2154         default:
2155                 return false;
2156         }
2157         if (range->offset == NFS4_MAX_UINT64)
2158                 return false;
2159         if (range->length == 0)
2160                 return false;
2161         if (range->length != NFS4_MAX_UINT64 &&
2162             range->length > NFS4_MAX_UINT64 - range->offset)
2163                 return false;
2164         return true;
2165 }
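
/*
 * For example, a whole-file range {IOMODE_RW, offset = 0,
 * length = NFS4_MAX_UINT64} passes the check above, while any range
 * whose offset + length would wrap past NFS4_MAX_UINT64 is rejected.
 */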
2166
2167 static struct pnfs_layout_hdr *
2168 _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
2169 {
2170         struct pnfs_layout_hdr *lo;
2171
2172         spin_lock(&ino->i_lock);
2173         lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
2174         if (!lo)
2175                 goto out_unlock;
2176         if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
2177                 goto out_unlock;
2178         if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
2179                 goto out_unlock;
2180         if (pnfs_layoutgets_blocked(lo))
2181                 goto out_unlock;
2182         if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags))
2183                 goto out_unlock;
2184         nfs_layoutget_begin(lo);
2185         spin_unlock(&ino->i_lock);
2186         _add_to_server_list(lo, NFS_SERVER(ino));
2187         return lo;
2188
2189 out_unlock:
2190         spin_unlock(&ino->i_lock);
2191         pnfs_put_layout_hdr(lo);
2192         return NULL;
2193 }
2194
2195 static void _lgopen_prepare_attached(struct nfs4_opendata *data,
2196                                      struct nfs_open_context *ctx)
2197 {
2198         struct inode *ino = data->dentry->d_inode;
2199         struct pnfs_layout_range rng = {
2200                 .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
2201                           IOMODE_RW : IOMODE_READ,
2202                 .offset = 0,
2203                 .length = NFS4_MAX_UINT64,
2204         };
2205         struct nfs4_layoutget *lgp;
2206         struct pnfs_layout_hdr *lo;
2207
2208         /* Heuristic: don't send layoutget if we have cached data */
2209         if (rng.iomode == IOMODE_READ &&
2210            (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0))
2211                 return;
2212
2213         lo = _pnfs_grab_empty_layout(ino, ctx);
2214         if (!lo)
2215                 return;
2216         lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
2217                                              &rng, GFP_KERNEL);
2218         if (!lgp) {
2219                 pnfs_clear_first_layoutget(lo);
2220                 pnfs_put_layout_hdr(lo);
2221                 return;
2222         }
2223         data->lgp = lgp;
2224         data->o_arg.lg_args = &lgp->args;
2225         data->o_res.lg_res = &lgp->res;
2226 }
2227
2228 static void _lgopen_prepare_floating(struct nfs4_opendata *data,
2229                                      struct nfs_open_context *ctx)
2230 {
2231         struct pnfs_layout_range rng = {
2232                 .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
2233                           IOMODE_RW : IOMODE_READ,
2234                 .offset = 0,
2235                 .length = NFS4_MAX_UINT64,
2236         };
2237         struct nfs4_layoutget *lgp;
2238
2239         lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
2240                                              &rng, GFP_KERNEL);
2241         if (!lgp)
2242                 return;
2243         data->lgp = lgp;
2244         data->o_arg.lg_args = &lgp->args;
2245         data->o_res.lg_res = &lgp->res;
2246 }
2247
2248 void pnfs_lgopen_prepare(struct nfs4_opendata *data,
2249                          struct nfs_open_context *ctx)
2250 {
2251         struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
2252
2253         if (!(pnfs_enabled_sb(server) &&
2254               server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
2255                 return;
2256         /* Could check on max_ops, but currently hardcoded high enough */
2257         if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
2258                 return;
2259         if (data->state)
2260                 _lgopen_prepare_attached(data, ctx);
2261         else
2262                 _lgopen_prepare_floating(data, ctx);
2263 }
2264
2265 void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
2266                        struct nfs_open_context *ctx)
2267 {
2268         struct pnfs_layout_hdr *lo;
2269         struct pnfs_layout_segment *lseg;
2270         struct nfs_server *srv = NFS_SERVER(ino);
2271         u32 iomode;
2272
2273         if (!lgp)
2274                 return;
2275         dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
2276         if (lgp->res.status) {
2277                 switch (lgp->res.status) {
2278                 default:
2279                         break;
2280                 /*
2281                  * Halt lgopen attempts if the server doesn't recognise
2282                  * the "current stateid" value, the layout type, or the
2283                  * layoutget operation as being valid.
2284                  * Also halt if it complains about too many ops in the
2285                  * compound or about the request/reply being too big.
2286                  */
2287                 case -NFS4ERR_BAD_STATEID:
2288                 case -NFS4ERR_NOTSUPP:
2289                 case -NFS4ERR_REP_TOO_BIG:
2290                 case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
2291                 case -NFS4ERR_REQ_TOO_BIG:
2292                 case -NFS4ERR_TOO_MANY_OPS:
2293                 case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
2294                         srv->caps &= ~NFS_CAP_LGOPEN;
2295                 }
2296                 return;
2297         }
2298         if (!lgp->args.inode) {
2299                 lo = _pnfs_grab_empty_layout(ino, ctx);
2300                 if (!lo)
2301                         return;
2302                 lgp->args.inode = ino;
2303         } else
2304                 lo = NFS_I(lgp->args.inode)->layout;
2305
2306         lseg = pnfs_layout_process(lgp);
2307         if (!IS_ERR(lseg)) {
2308                 iomode = lgp->args.range.iomode;
2309                 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2310                 pnfs_put_lseg(lseg);
2311         }
2312 }
2313
2314 void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
2315 {
2316         if (lgp != NULL) {
2317                 struct inode *inode = lgp->args.inode;
2318                 if (inode) {
2319                         struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
2320                         pnfs_clear_first_layoutget(lo);
2321                         nfs_layoutget_end(lo);
2322                 }
2323                 pnfs_layoutget_free(lgp);
2324         }
2325 }
2326
2327 struct pnfs_layout_segment *
2328 pnfs_layout_process(struct nfs4_layoutget *lgp)
2329 {
2330         struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
2331         struct nfs4_layoutget_res *res = &lgp->res;
2332         struct pnfs_layout_segment *lseg;
2333         struct inode *ino = lo->plh_inode;
2334         LIST_HEAD(free_me);
2335
2336         if (!pnfs_sanity_check_layout_range(&res->range))
2337                 return ERR_PTR(-EINVAL);
2338
2339         /* Inject layout blob into I/O device driver */
2340         lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
2341         if (IS_ERR_OR_NULL(lseg)) {
2342                 if (!lseg)
2343                         lseg = ERR_PTR(-ENOMEM);
2344
2345                 dprintk("%s: Could not allocate layout: error %ld\n",
2346                        __func__, PTR_ERR(lseg));
2347                 return lseg;
2348         }
2349
2350         pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
2351
2352         spin_lock(&ino->i_lock);
2353         if (pnfs_layoutgets_blocked(lo)) {
2354                 dprintk("%s forget reply due to state\n", __func__);
2355                 goto out_forget;
2356         }
2357
2358         if (!pnfs_layout_is_valid(lo)) {
2359                 /* We have a completely new layout */
2360                 pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
2361         } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
2362                 /* existing state ID, make sure the sequence number matches. */
2363                 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
2364                         dprintk("%s forget reply due to sequence\n", __func__);
2365                         goto out_forget;
2366                 }
2367                 pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
2368         } else {
2369                 /*
2370                  * We got an entirely new state ID.  Mark all segments for the
2371                  * inode invalid, and retry the layoutget
2372                  */
2373                 pnfs_mark_layout_stateid_invalid(lo, &free_me);
2374                 goto out_forget;
2375         }
2376
2377         pnfs_get_lseg(lseg);
2378         pnfs_layout_insert_lseg(lo, lseg, &free_me);
2379
2381         if (res->return_on_close)
2382                 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
2383
2384         spin_unlock(&ino->i_lock);
2385         pnfs_free_lseg_list(&free_me);
2386         return lseg;
2387
2388 out_forget:
2389         spin_unlock(&ino->i_lock);
2390         lseg->pls_layout = lo;
2391         NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
2392         return ERR_PTR(-EAGAIN);
2393 }
2394
2395 static int
2396 mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg,
2397                 struct list_head *tmp_list)
2398 {
2399         if (!mark_lseg_invalid(lseg, tmp_list))
2400                 return 0;
2401         pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg);
2402         return 1;
2403 }
2404
2405 /**
2406  * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
2407  * @lo: pointer to layout header
2408  * @tmp_list: list header to be used with pnfs_free_lseg_list()
2409  * @return_range: describe layout segment ranges to be returned
2410  * @seq: stateid seqid to match
2411  *
2412  * This function is mainly intended for use by layoutrecall. It attempts
2413  * to free the matching layout segments immediately, or else to mark them
2414  * for return as soon as their reference counts drop to zero.
2415  *
2416  * Returns
2417  * - 0: a layoutreturn needs to be scheduled.
2418  * - EBUSY: there are layout segments that are still in use.
2419  * - ENOENT: there are no layout segments that need to be returned.
2420  */
2421 int
2422 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
2423                                 struct list_head *tmp_list,
2424                                 const struct pnfs_layout_range *return_range,
2425                                 u32 seq)
2426 {
2427         struct pnfs_layout_segment *lseg, *next;
2428         int remaining = 0;
2429
2430         dprintk("%s:Begin lo %p\n", __func__, lo);
2431
2432         assert_spin_locked(&lo->plh_inode->i_lock);
2433
2434         list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
2435                 if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
2436                         dprintk("%s: marking lseg %p iomode %d "
2437                                 "offset %llu length %llu\n", __func__,
2438                                 lseg, lseg->pls_range.iomode,
2439                                 lseg->pls_range.offset,
2440                                 lseg->pls_range.length);
2441                         if (mark_lseg_invalid_or_return(lseg, tmp_list))
2442                                 continue;
2443                         remaining++;
2444                         set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
2445                 }
2446
2447         if (remaining) {
2448                 pnfs_set_plh_return_info(lo, return_range->iomode, seq);
2449                 return -EBUSY;
2450         }
2451
2452         if (!list_empty(&lo->plh_return_segs)) {
2453                 pnfs_set_plh_return_info(lo, return_range->iomode, seq);
2454                 return 0;
2455         }
2456
2457         return -ENOENT;
2458 }
2459
2460 static void
2461 pnfs_mark_layout_for_return(struct inode *inode,
2462                             const struct pnfs_layout_range *range)
2463 {
2464         struct pnfs_layout_hdr *lo;
2465         bool return_now = false;
2466
2467         spin_lock(&inode->i_lock);
2468         lo = NFS_I(inode)->layout;
2469         if (!pnfs_layout_is_valid(lo)) {
2470                 spin_unlock(&inode->i_lock);
2471                 return;
2472         }
2473         pnfs_set_plh_return_info(lo, range->iomode, 0);
2474         /*
2475          * mark all matching lsegs so that we are sure to have no live
2476          * segments at hand when sending layoutreturn. See pnfs_put_lseg()
2477          * for how it works.
2478          */
2479         if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
2480                 const struct cred *cred;
2481                 nfs4_stateid stateid;
2482                 enum pnfs_iomode iomode;
2483
2484                 return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
2485                 spin_unlock(&inode->i_lock);
2486                 if (return_now)
2487                         pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
2488         } else {
2489                 spin_unlock(&inode->i_lock);
2490                 nfs_commit_inode(inode, 0);
2491         }
2492 }
2493
2494 void pnfs_error_mark_layout_for_return(struct inode *inode,
2495                                        struct pnfs_layout_segment *lseg)
2496 {
2497         struct pnfs_layout_range range = {
2498                 .iomode = lseg->pls_range.iomode,
2499                 .offset = 0,
2500                 .length = NFS4_MAX_UINT64,
2501         };
2502
2503         pnfs_mark_layout_for_return(inode, &range);
2504 }
2505 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
2506
2507 static bool
2508 pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
2509 {
2510         return pnfs_layout_is_valid(lo) &&
2511                 !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
2512                 !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
2513 }
2514
2515 static struct pnfs_layout_segment *
2516 pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
2517                      const struct pnfs_layout_range *range,
2518                      enum pnfs_iomode iomode)
2519 {
2520         struct pnfs_layout_segment *lseg;
2521
2522         list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
2523                 if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
2524                         continue;
2525                 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
2526                         continue;
2527                 if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
2528                         continue;
2529                 if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
2530                         return lseg;
2531         }
2532         return NULL;
2533 }
2534
2535 /* Find open file states whose mode matches that of the range */
2536 static bool
2537 pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
2538                                  const struct pnfs_layout_range *range)
2539 {
2540         struct list_head *head;
2541         struct nfs_open_context *ctx;
2542         fmode_t mode = 0;
2543
2544         if (!pnfs_layout_can_be_returned(lo) ||
2545             !pnfs_find_first_lseg(lo, range, range->iomode))
2546                 return false;
2547
2548         head = &NFS_I(lo->plh_inode)->open_files;
2549         list_for_each_entry_rcu(ctx, head, list) {
2550                 if (ctx->state)
2551                         mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
2552         }
2553
2554         switch (range->iomode) {
2555         default:
2556                 break;
2557         case IOMODE_READ:
2558                 mode &= ~FMODE_WRITE;
2559                 break;
2560         case IOMODE_RW:
2561                 if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
2562                         mode &= ~FMODE_READ;
2563         }
2564         return mode == 0;
2565 }
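
/*
 * Example: for an IOMODE_READ range, FMODE_WRITE is masked off, so a
 * file still open for reading leaves mode == FMODE_READ and the READ
 * layout is considered in use; only once the last reader closes does
 * mode drop to 0 and the layout become returnable.
 */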
2566
2567 static int
2568 pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
2569 {
2570         const struct pnfs_layout_range *range = data;
2571         struct pnfs_layout_hdr *lo;
2572         struct inode *inode;
2573 restart:
2574         rcu_read_lock();
2575         list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
2576                 if (!pnfs_layout_can_be_returned(lo) ||
2577                     test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
2578                         continue;
2579                 inode = lo->plh_inode;
2580                 spin_lock(&inode->i_lock);
2581                 if (!pnfs_should_return_unused_layout(lo, range)) {
2582                         spin_unlock(&inode->i_lock);
2583                         continue;
2584                 }
2585                 spin_unlock(&inode->i_lock);
2586                 inode = pnfs_grab_inode_layout_hdr(lo);
2587                 if (!inode)
2588                         continue;
2589                 rcu_read_unlock();
2590                 pnfs_mark_layout_for_return(inode, range);
2591                 iput(inode);
2592                 cond_resched();
2593                 goto restart;
2594         }
2595         rcu_read_unlock();
2596         return 0;
2597 }
2598
2599 void
2600 pnfs_layout_return_unused_byclid(struct nfs_client *clp,
2601                                  enum pnfs_iomode iomode)
2602 {
2603         struct pnfs_layout_range range = {
2604                 .iomode = iomode,
2605                 .offset = 0,
2606                 .length = NFS4_MAX_UINT64,
2607         };
2608
2609         nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
2610                         &range);
2611 }
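
/*
 * A minimal usage sketch (hypothetical call site): a client-wide
 * cleanup pass could shed every unused READ layout like so:
 *
 *	pnfs_layout_return_unused_byclid(clp, IOMODE_READ);
 */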
2612
2613 void
2614 pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
2615 {
2616         if (pgio->pg_lseg == NULL ||
2617             test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
2618                 return;
2619         pnfs_put_lseg(pgio->pg_lseg);
2620         pgio->pg_lseg = NULL;
2621 }
2622 EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
2623
2624 /*
2625  * Check for any intersection between the request and the pgio->pg_lseg,
2626  * and if none, put this pgio->pg_lseg away.
2627  */
2628 void
2629 pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2630 {
2631         if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
2632                 pnfs_put_lseg(pgio->pg_lseg);
2633                 pgio->pg_lseg = NULL;
2634         }
2635 }
2636 EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
2637
2638 void
2639 pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2640 {
2641         u64 rd_size = req->wb_bytes;
2642
2643         pnfs_generic_pg_check_layout(pgio);
2644         pnfs_generic_pg_check_range(pgio, req);
2645         if (pgio->pg_lseg == NULL) {
2646                 if (pgio->pg_dreq == NULL)
2647                         rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
2648                 else
2649                         rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
2650
2651                 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2652                                                    nfs_req_openctx(req),
2653                                                    req_offset(req),
2654                                                    rd_size,
2655                                                    IOMODE_READ,
2656                                                    false,
2657                                                    GFP_KERNEL);
2658                 if (IS_ERR(pgio->pg_lseg)) {
2659                         pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2660                         pgio->pg_lseg = NULL;
2661                         return;
2662                 }
2663         }
2664         /* If no lseg, fall back to read through mds */
2665         if (pgio->pg_lseg == NULL)
2666                 nfs_pageio_reset_read_mds(pgio);
2668 }
2669 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
2670
2671 void
2672 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
2673                            struct nfs_page *req, u64 wb_size)
2674 {
2675         pnfs_generic_pg_check_layout(pgio);
2676         pnfs_generic_pg_check_range(pgio, req);
2677         if (pgio->pg_lseg == NULL) {
2678                 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2679                                                    nfs_req_openctx(req),
2680                                                    req_offset(req),
2681                                                    wb_size,
2682                                                    IOMODE_RW,
2683                                                    false,
2684                                                    GFP_KERNEL);
2685                 if (IS_ERR(pgio->pg_lseg)) {
2686                         pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2687                         pgio->pg_lseg = NULL;
2688                         return;
2689                 }
2690         }
2691         /* If no lseg, fall back to write through mds */
2692         if (pgio->pg_lseg == NULL)
2693                 nfs_pageio_reset_write_mds(pgio);
2694 }
2695 EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
2696
2697 void
2698 pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
2699 {
2700         if (desc->pg_lseg) {
2701                 pnfs_put_lseg(desc->pg_lseg);
2702                 desc->pg_lseg = NULL;
2703         }
2704 }
2705 EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
2706
2707 /*
2708  * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
2709  * of bytes (maximum @req->wb_bytes) that can be coalesced.
2710  */
2711 size_t
2712 pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
2713                      struct nfs_page *prev, struct nfs_page *req)
2714 {
2715         unsigned int size;
2716         u64 seg_end, req_start, seg_left;
2717
2718         size = nfs_generic_pg_test(pgio, prev, req);
2719         if (!size)
2720                 return 0;
2721
2722         /*
2723          * 'size' contains the number of bytes left in the current page (up
2724          * to the original size asked for in @req->wb_bytes).
2725          *
2726          * Calculate how many bytes are left in the layout segment
2727          * and if there are less bytes than 'size', return that instead.
2728          *
2729  * Please also note that 'seg_end' below is actually the offset of the
2730  * first byte that lies outside the layout segment's range.
2731          *
2732          */
2733         if (pgio->pg_lseg) {
2734                 seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
2735                                      pgio->pg_lseg->pls_range.length);
2736                 req_start = req_offset(req);
2737
2738                 /* start of request is past the last byte of this segment */
2739                 if (req_start >= seg_end)
2740                         return 0;
2741
2742                 /* adjust 'size' iff there are fewer bytes left in the
2743                  * segment than what nfs_generic_pg_test returned */
2744                 seg_left = seg_end - req_start;
2745                 if (seg_left < size)
2746                         size = (unsigned int)seg_left;
2747         }
2748
2749         return size;
2750 }
2751 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
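
/*
 * Worked example: if pg_lseg covers bytes [0, 8192) and
 * nfs_generic_pg_test() allows 8192 bytes for a request starting at
 * offset 4096, then seg_end = 8192 and seg_left = 4096, so only 4096
 * bytes may be coalesced; a request starting at or beyond 8192 gets
 * 0 and cannot be coalesced into this descriptor.
 */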
2752
2753 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
2754 {
2755         struct nfs_pageio_descriptor pgio;
2756
2757         /* Resend all requests through the MDS */
2758         nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
2759                               hdr->completion_ops);
2760         set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
2761         return nfs_pageio_resend(&pgio, hdr);
2762 }
2763 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2764
2765 static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
2766 {
2768         dprintk("pnfs write error = %d\n", hdr->pnfs_error);
2769         if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2770             PNFS_LAYOUTRET_ON_ERROR) {
2771                 pnfs_return_layout(hdr->inode);
2772         }
2773         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2774                 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
2775 }
2776
2777 /*
2778  * Called by non-RPC-based layout drivers
2779  */
2780 void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
2781 {
2782         if (likely(!hdr->pnfs_error)) {
2783                 pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
2784                                 hdr->mds_offset + hdr->res.count);
2785                 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2786         }
2787         trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
2788         if (unlikely(hdr->pnfs_error))
2789                 pnfs_ld_handle_write_error(hdr);
2790         hdr->mds_ops->rpc_release(hdr);
2791 }
2792 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
2793
2794 static void
2795 pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
2796                 struct nfs_pgio_header *hdr)
2797 {
2798         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2799
2800         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2801                 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2802                 nfs_pageio_reset_write_mds(desc);
2803                 mirror->pg_recoalesce = 1;
2804         }
2805         hdr->completion_ops->completion(hdr);
2806 }
2807
2808 static enum pnfs_try_status
2809 pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2810                         const struct rpc_call_ops *call_ops,
2811                         struct pnfs_layout_segment *lseg,
2812                         int how)
2813 {
2814         struct inode *inode = hdr->inode;
2815         enum pnfs_try_status trypnfs;
2816         struct nfs_server *nfss = NFS_SERVER(inode);
2817
2818         hdr->mds_ops = call_ops;
2819
2820         dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2821                 inode->i_ino, hdr->args.count, hdr->args.offset, how);
2822         trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2823         if (trypnfs != PNFS_NOT_ATTEMPTED)
2824                 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2825         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2826         return trypnfs;
2827 }
2828
2829 static void
2830 pnfs_do_write(struct nfs_pageio_descriptor *desc,
2831               struct nfs_pgio_header *hdr, int how)
2832 {
2833         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2834         struct pnfs_layout_segment *lseg = desc->pg_lseg;
2835         enum pnfs_try_status trypnfs;
2836
2837         trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2838         switch (trypnfs) {
2839         case PNFS_NOT_ATTEMPTED:
2840                 pnfs_write_through_mds(desc, hdr);
                /* Fallthrough */
2841         case PNFS_ATTEMPTED:
2842                 break;
2843         case PNFS_TRY_AGAIN:
2844                 /* cleanup hdr and prepare to redo pnfs */
2845                 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2846                         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2847                         list_splice_init(&hdr->pages, &mirror->pg_list);
2848                         mirror->pg_recoalesce = 1;
2849                 }
2850                 hdr->mds_ops->rpc_release(hdr);
2851         }
2852 }
2853
2854 static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2855 {
2856         pnfs_put_lseg(hdr->lseg);
2857         nfs_pgio_header_free(hdr);
2858 }
2859
2860 int
2861 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2862 {
2863         struct nfs_pgio_header *hdr;
2864         int ret;
2865
2866         hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2867         if (!hdr) {
2868                 desc->pg_error = -ENOMEM;
2869                 return desc->pg_error;
2870         }
2871         nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2872
2873         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2874         ret = nfs_generic_pgio(desc, hdr);
2875         if (!ret)
2876                 pnfs_do_write(desc, hdr, desc->pg_ioflags);
2877
2878         return ret;
2879 }
2880 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2881
2882 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2883 {
2884         struct nfs_pageio_descriptor pgio;
2885
2886         /* Resend all requests through the MDS */
2887         nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2888         return nfs_pageio_resend(&pgio, hdr);
2889 }
2890 EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2891
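/*
 * Handle a read error reported by the layout driver: optionally return
 * the layout (PNFS_LAYOUTRET_ON_ERROR) and resend the I/O through the MDS.
 */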
2892 static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2893 {
2894         dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2895         if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2896             PNFS_LAYOUTRET_ON_ERROR) {
2897                 pnfs_return_layout(hdr->inode);
2898         }
2899         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2900                 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2901 }
2902
2903 /*
2904  * Called by non-RPC-based layout drivers.
2905  */
2906 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2907 {
2908         if (likely(!hdr->pnfs_error))
2909                 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2910         trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2911         if (unlikely(hdr->pnfs_error))
2912                 pnfs_ld_handle_read_error(hdr);
2913         hdr->mds_ops->rpc_release(hdr);
2914 }
2915 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2916
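/*
 * Redo the read through the MDS: splice the pages back onto the current
 * mirror, reset the descriptor to the MDS read path, and mark it for
 * recoalescing.
 */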
2917 static void
2918 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2919                 struct nfs_pgio_header *hdr)
2920 {
2921         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2922
2923         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2924                 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2925                 nfs_pageio_reset_read_mds(desc);
2926                 mirror->pg_recoalesce = 1;
2927         }
2928         hdr->completion_ops->completion(hdr);
2929 }
2930
2931 /*
2932  * Call the appropriate parallel I/O subsystem read function.
2933  */
2934 static enum pnfs_try_status
2935 pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2936                        const struct rpc_call_ops *call_ops,
2937                        struct pnfs_layout_segment *lseg)
2938 {
2939         struct inode *inode = hdr->inode;
2940         struct nfs_server *nfss = NFS_SERVER(inode);
2941         enum pnfs_try_status trypnfs;
2942
2943         hdr->mds_ops = call_ops;
2944
2945         dprintk("%s: Reading ino:%lu %u@%llu\n",
2946                 __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2947
2948         trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2949         if (trypnfs != PNFS_NOT_ATTEMPTED)
2950                 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2951         dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2952         return trypnfs;
2953 }
2954
2955 /* Resend all requests through pnfs. */
2956 void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2957 {
2958         struct nfs_pageio_descriptor pgio;
2959
2960         if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2961                 /* Release the lseg first to prevent deadlocks with layoutreturn */
2962                 pnfs_put_lseg(hdr->lseg);
2963                 hdr->lseg = NULL;
2964
2965                 nfs_pageio_init_read(&pgio, hdr->inode, false,
2966                                         hdr->completion_ops);
2967                 hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
2968         }
2969 }
2970 EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2971
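/* Read-side counterpart of pnfs_do_write() */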
2972 static void
2973 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2974 {
2975         const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2976         struct pnfs_layout_segment *lseg = desc->pg_lseg;
2977         enum pnfs_try_status trypnfs;
2978
2979         trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2980         switch (trypnfs) {
2981         case PNFS_NOT_ATTEMPTED:
2982                 pnfs_read_through_mds(desc, hdr);
                     break;
2983         case PNFS_ATTEMPTED:
2984                 break;
2985         case PNFS_TRY_AGAIN:
2986                 /* cleanup hdr and prepare to redo pnfs */
2987                 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2988                         struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2989                         list_splice_init(&hdr->pages, &mirror->pg_list);
2990                         mirror->pg_recoalesce = 1;
2991                 }
2992                 hdr->mds_ops->rpc_release(hdr);
2993         }
2994 }
2995
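/* Matches the pnfs_get_lseg() in pnfs_generic_pg_readpages() */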
2996 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2997 {
2998         pnfs_put_lseg(hdr->lseg);
2999         nfs_pgio_header_free(hdr);
3000 }
3001
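/*
 * Set up a pNFS read: allocate and initialise the header, pin the lseg,
 * run the generic page I/O setup, and dispatch through pnfs_do_read().
 */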
3002 int
3003 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
3004 {
3005         struct nfs_pgio_header *hdr;
3006         int ret;
3007
3008         hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
3009         if (!hdr) {
3010                 desc->pg_error = -ENOMEM;
3011                 return desc->pg_error;
3012         }
3013         nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
3014         hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
3015         ret = nfs_generic_pgio(desc, hdr);
3016         if (!ret)
3017                 pnfs_do_read(desc, hdr);
3018         return ret;
3019 }
3020 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
3021
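/*
 * Clear NFS_INO_LAYOUTCOMMITTING and wake up anyone waiting on it in
 * pnfs_layoutcommit_inode().
 */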
3022 static void pnfs_clear_layoutcommitting(struct inode *inode)
3023 {
3024         unsigned long *bitlock = &NFS_I(inode)->flags;
3025
3026         clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
3027         smp_mb__after_atomic();
3028         wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
3029 }
3030
3031 /*
3032  * There can be multiple RW segments; collect the ones flagged
      * NFS_LSEG_LAYOUTCOMMIT onto @listp for the pending LAYOUTCOMMIT.
3033  */
3034 static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
3035 {
3036         struct pnfs_layout_segment *lseg;
3037
3038         list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
3039                 if (lseg->pls_range.iomode == IOMODE_RW &&
3040                     test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
3041                         list_add(&lseg->pls_lc_list, listp);
3042         }
3043 }
3044
3045 static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
3046 {
3047         struct pnfs_layout_segment *lseg, *tmp;
3048
3049         /* Matched by references in pnfs_set_layoutcommit */
3050         list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
3051                 list_del_init(&lseg->pls_lc_list);
3052                 pnfs_put_lseg(lseg);
3053         }
3054
3055         pnfs_clear_layoutcommitting(inode);
3056 }
3057
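/*
 * Mark the layout as failed for this lseg's iomode; subsequent I/O for
 * that range falls back to the MDS.
 */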
3058 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
3059 {
3060         pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
3061 }
3062 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
3063
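/*
 * Record that a LAYOUTCOMMIT is needed for @inode: track the last write
 * byte (plh_lwb), pin the lseg until nfs4_layoutcommit_release, and mark
 * the inode dirty so that ->write_inode eventually sends the LAYOUTCOMMIT.
 */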
3064 void
3065 pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
3066                 loff_t end_pos)
3067 {
3068         struct nfs_inode *nfsi = NFS_I(inode);
3069         bool mark_as_dirty = false;
3070
3071         spin_lock(&inode->i_lock);
3072         if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
3073                 nfsi->layout->plh_lwb = end_pos;
3074                 mark_as_dirty = true;
3075                 dprintk("%s: Set layoutcommit for inode %lu\n",
3076                         __func__, inode->i_ino);
3077         } else if (end_pos > nfsi->layout->plh_lwb)
3078                 nfsi->layout->plh_lwb = end_pos;
3079         if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
3080                 /* references matched in nfs4_layoutcommit_release */
3081                 pnfs_get_lseg(lseg);
3082         }
3083         spin_unlock(&inode->i_lock);
3084         dprintk("%s: lseg %p end_pos %llu\n",
3085                 __func__, lseg, nfsi->layout->plh_lwb);
3086
3087         /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
3088          * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
3089         if (mark_as_dirty)
3090                 mark_inode_dirty_sync(inode);
3091 }
3092 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
3093
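/*
 * Called as the LAYOUTCOMMIT is being released: give the layout driver a
 * chance to clean up, then drop the lseg references collected for the commit.
 */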
3094 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
3095 {
3096         struct nfs_server *nfss = NFS_SERVER(data->args.inode);
3097
3098         if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
3099                 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
3100         pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
3101 }
3102
3103 /*
3104  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
3105  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
3106  * data to disk to allow the server to recover the data if it crashes.
3107  * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
3108  * is off and either a COMMIT is sent to a data server or WRITEs
3109  * to a data server return NFS_DATA_SYNC.
3110  */
3111 int
3112 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
3113 {
3114         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
3115         struct nfs4_layoutcommit_data *data;
3116         struct nfs_inode *nfsi = NFS_I(inode);
3117         loff_t end_pos;
3118         int status;
3119
3120         if (!pnfs_layoutcommit_outstanding(inode))
3121                 return 0;
3122
3123         dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
3124
3125         status = -EAGAIN;
3126         if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
3127                 if (!sync)
3128                         goto out;
3129                 status = wait_on_bit_lock_action(&nfsi->flags,
3130                                 NFS_INO_LAYOUTCOMMITTING,
3131                                 nfs_wait_bit_killable,
3132                                 TASK_KILLABLE);
3133                 if (status)
3134                         goto out;
3135         }
3136
3137         status = -ENOMEM;
3138         /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
3139         data = kzalloc(sizeof(*data), GFP_NOFS);
3140         if (!data)
3141                 goto clear_layoutcommitting;
3142
3143         status = 0;
3144         spin_lock(&inode->i_lock);
3145         if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
3146                 goto out_unlock;
3147
3148         INIT_LIST_HEAD(&data->lseg_list);
3149         pnfs_list_write_lseg(inode, &data->lseg_list);
3150
3151         end_pos = nfsi->layout->plh_lwb;
3152
3153         nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
3154         data->cred = get_cred(nfsi->layout->plh_lc_cred);
3155         spin_unlock(&inode->i_lock);
3156
3157         data->args.inode = inode;
3158         nfs_fattr_init(&data->fattr);
3159         data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
3160         data->res.fattr = &data->fattr;
3161         if (end_pos != 0)
3162                 data->args.lastbytewritten = end_pos - 1;
3163         else
3164                 data->args.lastbytewritten = U64_MAX;
3165         data->res.server = NFS_SERVER(inode);
3166
3167         if (ld->prepare_layoutcommit) {
3168                 status = ld->prepare_layoutcommit(&data->args);
3169                 if (status) {
3170                         put_cred(data->cred);
3171                         spin_lock(&inode->i_lock);
3172                         set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
3173                         if (end_pos > nfsi->layout->plh_lwb)
3174                                 nfsi->layout->plh_lwb = end_pos;
3175                         goto out_unlock;
3176                 }
3177         }
3178
3180         status = nfs4_proc_layoutcommit(data, sync);
3181 out:
3182         if (status)
3183                 mark_inode_dirty_sync(inode);
3184         dprintk("<-- %s status %d\n", __func__, status);
3185         return status;
3186 out_unlock:
3187         spin_unlock(&inode->i_lock);
3188         kfree(data);
3189 clear_layoutcommitting:
3190         pnfs_clear_layoutcommitting(inode);
3191         goto out;
3192 }
3193 EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
3194
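/*
 * ->sync helper for pNFS mounts: force a synchronous LAYOUTCOMMIT,
 * regardless of the datasync flag.
 */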
3195 int
3196 pnfs_generic_sync(struct inode *inode, bool datasync)
3197 {
3198         return pnfs_layoutcommit_inode(inode, true);
3199 }
3200 EXPORT_SYMBOL_GPL(pnfs_generic_sync);
3201
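/*
 * Allocate the mdsthreshold hint that the server uses to tell the client
 * when I/O should go to the MDS rather than through the layout.
 */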
3202 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
3203 {
3204         struct nfs4_threshold *thp;
3205
3206         thp = kzalloc(sizeof(*thp), GFP_NOFS);
3207         if (!thp) {
3208                 dprintk("%s mdsthreshold allocation failed\n", __func__);
3209                 return NULL;
3210         }
3211         return thp;
3212 }
3213
3214 #if IS_ENABLED(CONFIG_NFS_V4_2)
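/*
 * Send a LAYOUTSTATS report for @inode. NFS_INO_LAYOUTSTATS ensures at
 * most one report is in flight per inode at any time.
 */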
3215 int
3216 pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
3217 {
3218         struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
3219         struct nfs_server *server = NFS_SERVER(inode);
3220         struct nfs_inode *nfsi = NFS_I(inode);
3221         struct nfs42_layoutstat_data *data;
3222         struct pnfs_layout_hdr *hdr;
3223         int status = 0;
3224
3225         if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
3226                 goto out;
3227
3228         if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
3229                 goto out;
3230
3231         if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
3232                 goto out;
3233
3234         spin_lock(&inode->i_lock);
3235         if (!NFS_I(inode)->layout) {
3236                 spin_unlock(&inode->i_lock);
3237                 goto out_clear_layoutstats;
3238         }
3239         hdr = NFS_I(inode)->layout;
3240         pnfs_get_layout_hdr(hdr);
3241         spin_unlock(&inode->i_lock);
3242
3243         data = kzalloc(sizeof(*data), gfp_flags);
3244         if (!data) {
3245                 status = -ENOMEM;
3246                 goto out_put;
3247         }
3248
3249         data->args.fh = NFS_FH(inode);
3250         data->args.inode = inode;
3251         status = ld->prepare_layoutstats(&data->args);
3252         if (status)
3253                 goto out_free;
3254
3255         status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
3256
3257 out:
3258         dprintk("%s returns %d\n", __func__, status);
3259         return status;
3260
3261 out_free:
3262         kfree(data);
3263 out_put:
3264         pnfs_put_layout_hdr(hdr);
3265 out_clear_layoutstats:
3266         smp_mb__before_atomic();
3267         clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
3268         smp_mb__after_atomic();
3269         goto out;
3270 }
3271 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
3272 #endif /* CONFIG_NFS_V4_2 */
3273
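/*
 * Reporting interval, in seconds, used by layout drivers (e.g. flexfiles)
 * when scheduling LAYOUTSTATS; 0 selects the driver's default.
 */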
3274 unsigned int layoutstats_timer;
3275 module_param(layoutstats_timer, uint, 0644);
3276 EXPORT_SYMBOL_GPL(layoutstats_timer);