4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
51 #include "../include/cl_object.h"
/*
 * Forward declarations for static helpers defined later in this file.
 * NOTE(review): fragmentary excerpt — intermediate lines are missing
 * between the numbered source lines below.
 */
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache.
 * GFP_NOFS: avoids filesystem re-entry during memory reclaim.
 * NOTE(review): fragment — the allocation-failure check and return
 * statement are on lines not visible here.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Release a ll_file_data back to its slab cache (counterpart of
 * ll_file_data_get()). NOTE(review): fragment — body lines missing. */
74 static void ll_file_data_put(struct ll_file_data *fd)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current attributes (mode, times, size, blocks, flags),
 * IO epoch, open handle @fh and MDS capability into @op_data, so they can
 * be sent to the MDS with a close/setattr request.
 * NOTE(review): fragment — some lines between the numbered statements
 * are not visible here.
 */
80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast;
 * inode flags are translated to the on-wire ext-flag representation. */
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Tell the MDS that cached data was modified so it refreshes attributes. */
97 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
102 * Closes the IO epoch and packs all the attributes into @op_data for
/* (continuation of the original block comment; surrounding comment lines
 * are missing from this fragment) */
105 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
/* Always send mode and all three timestamps with the close. */
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
/* Size/blocks handling below only applies to write opens. */
112 if (!(och->och_flags & FMODE_WRITE))
/* Without Size-on-MDS support (or for non-regular files) the client is
 * authoritative for size/blocks, so include them directly. */
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
118 ll_ioepoch_close(inode, op_data, &och, 0);
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for @och to the MDS, releasing the open handle.
 * If @data_version is non-NULL this close also carries an HSM release
 * (MDS_HSM_RELEASE) with the given data version.
 * NOTE(review): fragment — error paths, labels and several statements are
 * on lines not visible here; read against the full source before editing.
 */
126 static int ll_close_inode_openhandle(struct obd_export *md_exp,
128 struct obd_client_handle *och,
129 const __u64 *data_version)
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
140 * XXX: in case of LMV, is this correct to access
143 CERROR("Invalid MDC connection handle %#llx\n",
144 ll_i2mdexp(inode)->exp_handle.h_cookie);
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
151 /* XXX We leak openhandle and request here. */
156 ll_prepare_close(inode, op_data, och);
157 if (data_version != NULL) {
158 /* Pass in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* Remember whether ll_prepare_close() marked this as the epoch-closing
 * close; checked again after md_close() below. */
164 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
170 * OSTs and send setattr to back to MDS. */
171 rc = ll_som_update(inode, op_data);
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
178 CERROR("inode %lu mdc close failed: rc = %d\n",
182 /* DATA_MODIFIED flag was successfully sent on close, cancel data
183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
193 rc = ll_objects_destroy(req, inode);
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* For an HSM release close, verify the server actually released the file. */
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
200 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201 if (!(body->valid & OBD_MD_FLRELEASED))
205 ll_finish_md_op_data(op_data);
/* SOM write-close whose epoch was not closed yet: defer the final
 * DONE_WRITING to the dedicated queue. */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle for the given open mode (read/write/exec) if
 * no other local opens of that mode remain. Called from ll_md_close()
 * when the open lock does not let us skip talking to the MDS.
 * NOTE(review): fragment — several lines (handle swap, unlock paths,
 * return) are missing from this excerpt.
 */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* Pick the per-mode handle slot and its use count. */
230 if (fmode & FMODE_WRITE) {
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
233 } else if (fmode & FMODE_EXEC) {
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
237 LASSERT(fmode & FMODE_READ);
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
242 mutex_lock(&lli->lli_och_mutex);
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
246 mutex_unlock(&lli->lli_och_mutex);
252 mutex_unlock(&lli->lli_och_mutex);
255 /* There might be a race and this handle may already
257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: releases group lock and lease if held, drops the
 * per-mode open-FD count, and only performs the real MDS close when no
 * matching OPEN lock lets us skip it. Frees the ll_file_data.
 * NOTE(review): fragment — lockmode setup and several statements are on
 * lines not visible here.
 */
264 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
/* TEST_LOCK: we only probe for an existing granted OPEN lock, we do not
 * want to enqueue a new one. */
270 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
271 struct lustre_handle lockh;
272 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
275 /* clear group lock, if present */
276 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
277 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
279 if (fd->fd_lease_och != NULL) {
282 /* Usually the lease is not released when the
283 * application crashed, we need to release here. */
284 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
285 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
286 PFID(&lli->lli_fid), rc, lease_broken);
288 fd->fd_lease_och = NULL;
291 if (fd->fd_och != NULL) {
292 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
297 /* Let's see if we have good enough OPEN lock on the file and if
298 we can skip talking to MDS */
300 mutex_lock(&lli->lli_och_mutex);
301 if (fd->fd_omode & FMODE_WRITE) {
303 LASSERT(lli->lli_open_fd_write_count);
304 lli->lli_open_fd_write_count--;
305 } else if (fd->fd_omode & FMODE_EXEC) {
307 LASSERT(lli->lli_open_fd_exec_count);
308 lli->lli_open_fd_exec_count--;
311 LASSERT(lli->lli_open_fd_read_count);
312 lli->lli_open_fd_read_count--;
314 mutex_unlock(&lli->lli_och_mutex);
/* No matching cached OPEN lock: must do a real close on the MDS. */
316 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
317 LDLM_IBITS, &policy, lockmode, &lockh))
318 rc = ll_md_real_close(inode, fd->fd_omode);
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
323 ll_capa_close(inode);
328 /* While this returns an error code, fput() the caller does not, so we need
329 * to make every effort to clean up all of our state here. Also, applications
330 * rarely check close errors and even if an error is returned they will not
331 * re-try the close call.
/* VFS ->release() hook. NOTE(review): fragment — early returns and some
 * statements are on lines missing from this excerpt. */
333 int ll_file_release(struct inode *inode, struct file *file)
335 struct ll_file_data *fd;
336 struct ll_sb_info *sbi = ll_i2sbi(inode);
337 struct ll_inode_info *lli = ll_i2info(inode);
340 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
341 inode->i_generation, inode);
343 #ifdef CONFIG_FS_POSIX_ACL
/* Remote-client ACL bookkeeping is only cleaned up on root-inode release. */
344 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
345 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
348 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
349 fd->fd_flags &= ~LL_FILE_RMTACL;
350 rct_del(&sbi->ll_rct, current_pid());
351 et_search_free(&sbi->ll_et, current_pid());
356 if (!is_root_inode(inode))
357 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
358 fd = LUSTRE_FPRIVATE(file);
361 /* The last ref on @file, maybe not the owner pid of statahead.
362 * Different processes can open the same dir, "ll_opendir_key" means:
363 * it is me that should stop the statahead thread. */
364 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
365 lli->lli_opendir_pid != 0)
366 ll_stop_statahead(inode, lli->lli_opendir_key);
/* Root inode never went through ll_md_close() setup; just drop fd. */
368 if (is_root_inode(inode)) {
369 LUSTRE_FPRIVATE(file) = NULL;
370 ll_file_data_put(fd);
/* Fold any pending async write errors into lli_async_rc for reporting. */
374 if (!S_ISDIR(inode->i_mode)) {
375 lov_read_and_clear_async_rc(lli->lli_clob);
376 lli->lli_async_rc = 0;
379 rc = ll_md_close(sbi->ll_md_exp, inode, file);
381 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
382 libcfs_debug_dumplog();
/*
 * Send an open intent (md_intent_lock) for @dentry to the MDS, optionally
 * with striping data (@lmm/@lmmsize). On success, fills the inode from the
 * reply and records lock data. Used when no cached open disposition exists.
 * NOTE(review): fragment — error labels and some statements are on lines
 * missing from this excerpt.
 */
387 static int ll_intent_file_open(struct dentry *dentry, void *lmm,
388 int lmmsize, struct lookup_intent *itp)
390 struct inode *inode = dentry->d_inode;
391 struct ll_sb_info *sbi = ll_i2sbi(inode);
392 struct dentry *parent = dentry->d_parent;
393 const char *name = dentry->d_name.name;
394 const int len = dentry->d_name.len;
395 struct md_op_data *op_data;
396 struct ptlrpc_request *req;
397 __u32 opc = LUSTRE_OPC_ANY;
400 /* Usually we come here only for NFSD, and we want open lock.
401 But we can also get here with pre 2.6.15 patchless kernels, and in
402 that case that lock is also ok */
403 /* We can also get here if there was cached open handle in revalidate_it
404 * but it disappeared while we were getting from there to ll_file_open.
405 * But this means this file was closed and immediately opened which
406 * makes a good candidate for using OPEN lock */
407 /* If lmmsize & lmm are not 0, we are just setting stripe info
408 * parameters. No need for the open lock */
409 if (lmm == NULL && lmmsize == 0) {
410 itp->it_flags |= MDS_OPEN_LOCK;
411 if (itp->it_flags & FMODE_WRITE)
412 opc = LUSTRE_OPC_CREATE;
415 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
419 return PTR_ERR(op_data);
421 itp->it_flags |= MDS_OPEN_BY_FID;
422 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
423 0 /*unused */, &req, ll_md_blocking_ast, 0);
424 ll_finish_md_op_data(op_data);
426 /* reason for keep own exit path - don`t flood log
427 * with messages with -ESTALE errors.
/* If the server opened but we cannot use the open, release the handle. */
429 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
430 it_open_error(DISP_OPEN_OPEN, itp))
432 ll_release_openhandle(inode, itp);
436 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
441 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
442 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
443 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
447 rc = ll_prep_inode(&inode, req, NULL, itp);
448 if (!rc && itp->d.lustre.it_lock_mode)
449 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
452 ptlrpc_req_finished(req);
453 ll_intent_drop_lock(itp);
459 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
460 * not believe attributes if a few ioepoch holders exist. Attributes for
461 * previous ioepoch if new one is opened are also skipped by MDS.
/* NOTE(review): fragment — surrounding comment/brace lines missing. */
463 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* Only record a new, non-zero epoch; zero means "no epoch supplied". */
465 if (ioepoch && lli->lli_ioepoch != ioepoch) {
466 lli->lli_ioepoch = ioepoch;
467 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
468 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the MDT reply carried in the intent
 * (open handle, fid, lease-lock cookie, flags) and register it for open
 * replay. Returns the result of md_set_open_replay_data().
 * NOTE(review): fragment — some lines missing between statements.
 */
472 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
473 struct obd_client_handle *och)
475 struct ptlrpc_request *req = it->d.lustre.it_data;
476 struct mdt_body *body;
478 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
479 och->och_fh = body->handle;
480 och->och_fid = body->fid1;
481 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
482 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
483 och->och_flags = it->it_flags;
485 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish a local open: optionally fill @och from the intent reply, record
 * the IO epoch, attach @fd as the file's private data and initialize
 * readahead state. fd_omode caches the effective open mode for close.
 * NOTE(review): fragment — conditional structure around @och and the
 * return are on lines missing from this excerpt.
 */
488 static int ll_local_open(struct file *file, struct lookup_intent *it,
489 struct ll_file_data *fd, struct obd_client_handle *och)
491 struct inode *inode = file_inode(file);
492 struct ll_inode_info *lli = ll_i2info(inode);
494 LASSERT(!LUSTRE_FPRIVATE(file));
499 struct ptlrpc_request *req = it->d.lustre.it_data;
500 struct mdt_body *body;
503 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
507 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
508 ll_ioepoch_open(lli, body->ioepoch);
511 LUSTRE_FPRIVATE(file) = fd;
512 ll_readahead_init(inode, &fd->fd_ras);
513 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
517 /* Open a file, and (for the very first open) create objects on the OSTs at
518 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
519 * creation or open until ll_lov_setstripe() ioctl is called.
521 * If we already have the stripe MD locally then we don't request it in
522 * md_open(), by passing a lmm_size = 0.
524 * It is up to the application to ensure no other processes open this file
525 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
526 * used. We might be able to avoid races of that sort by getting lli_open_sem
527 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
528 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook. NOTE(review): fragment — many lines (labels, gotos,
 * restart loop, returns) are missing from this excerpt; treat structure
 * here as indicative only. */
530 int ll_file_open(struct inode *inode, struct file *file)
532 struct ll_inode_info *lli = ll_i2info(inode);
533 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
534 .it_flags = file->f_flags };
535 struct obd_client_handle **och_p = NULL;
536 __u64 *och_usecount = NULL;
537 struct ll_file_data *fd;
538 int rc = 0, opendir_set = 0;
540 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
541 inode->i_generation, inode, file->f_flags);
543 it = file->private_data; /* XXX: compat macro */
544 file->private_data = NULL; /* prevent ll_local_open assertion */
546 fd = ll_file_data_get();
/* First opener of a directory registers itself as the statahead owner. */
553 if (S_ISDIR(inode->i_mode)) {
554 spin_lock(&lli->lli_sa_lock);
555 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
556 lli->lli_opendir_pid == 0) {
557 lli->lli_opendir_key = fd;
558 lli->lli_opendir_pid = current_pid();
561 spin_unlock(&lli->lli_sa_lock);
564 if (is_root_inode(inode)) {
565 LUSTRE_FPRIVATE(file) = fd;
/* No cached intent disposition: build our own IT_OPEN intent (oit). */
569 if (!it || !it->d.lustre.it_disposition) {
570 /* Convert f_flags into access mode. We cannot use file->f_mode,
571 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
573 if ((oit.it_flags + 1) & O_ACCMODE)
575 if (file->f_flags & O_TRUNC)
576 oit.it_flags |= FMODE_WRITE;
578 /* kernel only call f_op->open in dentry_open. filp_open calls
579 * dentry_open after call to open_namei that checks permissions.
580 * Only nfsd_open call dentry_open directly without checking
581 * permissions and because of that this code below is safe. */
582 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
583 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
585 /* We do not want O_EXCL here, presumably we opened the file
586 * already? XXX - NFS implications? */
587 oit.it_flags &= ~O_EXCL;
589 /* bug20584, if "it_flags" contains O_CREAT, the file will be
590 * created if necessary, then "IT_CREAT" should be set to keep
591 * consistent with it */
592 if (oit.it_flags & O_CREAT)
593 oit.it_op |= IT_CREAT;
599 /* Let's see if we have file open on MDS already. */
600 if (it->it_flags & FMODE_WRITE) {
601 och_p = &lli->lli_mds_write_och;
602 och_usecount = &lli->lli_open_fd_write_count;
603 } else if (it->it_flags & FMODE_EXEC) {
604 och_p = &lli->lli_mds_exec_och;
605 och_usecount = &lli->lli_open_fd_exec_count;
607 och_p = &lli->lli_mds_read_och;
608 och_usecount = &lli->lli_open_fd_read_count;
611 mutex_lock(&lli->lli_och_mutex);
612 if (*och_p) { /* Open handle is present */
613 if (it_disposition(it, DISP_OPEN_OPEN)) {
614 /* Well, there's extra open request that we do not need,
615 let's close it somehow. This will decref request. */
616 rc = it_open_error(DISP_OPEN_OPEN, it);
618 mutex_unlock(&lli->lli_och_mutex);
622 ll_release_openhandle(inode, it);
/* Reuse the existing MDS handle; NULL och means "no new handle". */
626 rc = ll_local_open(file, it, fd, NULL);
629 mutex_unlock(&lli->lli_och_mutex);
633 LASSERT(*och_usecount == 0);
634 if (!it->d.lustre.it_disposition) {
635 /* We cannot just request lock handle now, new ELC code
636 means that one of other OPEN locks for this file
637 could be cancelled, and since blocking ast handler
638 would attempt to grab och_mutex as well, that would
639 result in a deadlock */
640 mutex_unlock(&lli->lli_och_mutex);
641 it->it_create_mode |= M_CHECK_STALE;
642 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
643 it->it_create_mode &= ~M_CHECK_STALE;
649 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
657 /* md_intent_lock() didn't get a request ref if there was an
658 * open error, so don't do cleanup on the request here
660 /* XXX (green): Should not we bail out on any error here, not
661 * just open error? */
662 rc = it_open_error(DISP_OPEN_OPEN, it);
666 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
668 rc = ll_local_open(file, it, fd, *och_p);
672 mutex_unlock(&lli->lli_och_mutex);
675 /* Must do this outside lli_och_mutex lock to prevent deadlock where
676 different kind of OPEN lock for this same inode gets cancelled
677 by ldlm_cancel_lru */
678 if (!S_ISREG(inode->i_mode))
/* Delay OST object creation for O_LOV_DELAY_CREATE / read-only opens. */
683 if (!lli->lli_has_smd &&
684 (cl_is_lov_delay_create(file->f_flags) ||
685 (file->f_mode & FMODE_WRITE) == 0)) {
686 CDEBUG(D_INODE, "object creation was delayed\n");
689 cl_lov_delay_create_clear(&file->f_flags);
/* Error path: free a half-initialized open handle. */
694 if (och_p && *och_p) {
695 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
696 *och_p = NULL; /* OBD_FREE writes some magic there */
699 mutex_unlock(&lli->lli_och_mutex);
702 if (opendir_set != 0)
703 ll_stop_statahead(inode, lli->lli_opendir_key);
705 ll_file_data_put(fd);
707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the intent's request reference once we are done with the reply. */
710 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
711 ptlrpc_req_finished(it->d.lustre.it_data);
712 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously. Unlike ll_md_blocking_ast, this deliberately does
 * not touch open handles (see comment at the md_intent_lock() call site).
 * NOTE(review): fragment — switch braces/returns are on missing lines.
 */
718 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
719 struct ldlm_lock_desc *desc, void *data, int flag)
722 struct lustre_handle lockh;
725 case LDLM_CB_BLOCKING:
726 ldlm_lock2handle(lock, &lockh);
727 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
729 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
733 case LDLM_CB_CANCELING:
741 * Acquire a lease and open the file.
/* Returns the new obd_client_handle on success or ERR_PTR on failure.
 * If @file is given, the lease reuses (and takes over) that file's
 * existing MDS open handle so the MDT sees the same owner.
 * NOTE(review): fragment — error labels, gotos and some statements are
 * on lines missing from this excerpt. */
743 static struct obd_client_handle *
744 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
747 struct lookup_intent it = { .it_op = IT_OPEN };
748 struct ll_sb_info *sbi = ll_i2sbi(inode);
749 struct md_op_data *op_data;
750 struct ptlrpc_request *req;
751 struct lustre_handle old_handle = { 0 };
752 struct obd_client_handle *och = NULL;
/* Leases are exclusive to plain read or plain write mode. */
756 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
757 return ERR_PTR(-EINVAL);
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
762 struct obd_client_handle **och_p;
765 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
766 return ERR_PTR(-EPERM);
768 /* Get the openhandle of the file */
770 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per struct file. */
771 if (fd->fd_lease_och != NULL) {
772 mutex_unlock(&lli->lli_och_mutex);
776 if (fd->fd_och == NULL) {
777 if (file->f_mode & FMODE_WRITE) {
778 LASSERT(lli->lli_mds_write_och != NULL);
779 och_p = &lli->lli_mds_write_och;
780 och_usecount = &lli->lli_open_fd_write_count;
782 LASSERT(lli->lli_mds_read_och != NULL);
783 och_p = &lli->lli_mds_read_och;
784 och_usecount = &lli->lli_open_fd_read_count;
786 if (*och_usecount == 1) {
793 mutex_unlock(&lli->lli_och_mutex);
794 if (rc < 0) /* more than 1 opener */
797 LASSERT(fd->fd_och != NULL);
798 old_handle = fd->fd_och->och_fh;
801 och = kzalloc(sizeof(*och), GFP_NOFS);
803 return ERR_PTR(-ENOMEM);
805 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
806 LUSTRE_OPC_ANY, NULL);
807 if (IS_ERR(op_data)) {
808 rc = PTR_ERR(op_data);
812 /* To tell the MDT this openhandle is from the same owner */
813 op_data->op_handle = old_handle;
815 it.it_flags = fmode | open_flags;
816 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
817 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
818 ll_md_blocking_lease_ast,
819 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
820 * it can be cancelled which may mislead applications that the lease is
822 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
823 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
824 * doesn't deal with openhandle, so normal openhandle will be leaked. */
825 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
826 ll_finish_md_op_data(op_data);
827 ptlrpc_req_finished(req);
831 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
836 rc = it_open_error(DISP_OPEN_OPEN, &it);
840 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
841 ll_och_fill(sbi->ll_md_exp, &it, och);
843 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
848 /* already get lease, handle lease lock */
849 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A valid lease must come back with an OPEN-bit inodebits lock. */
850 if (it.d.lustre.it_lock_mode == 0 ||
851 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
852 /* open lock must return for lease */
853 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
854 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
855 it.d.lustre.it_lock_bits);
860 ll_intent_release(&it);
/* Error cleanup path: close the handle and cancel the open lock. */
864 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
866 CERROR("Close openhandle returned %d\n", rc2);
868 /* cancel open lock */
869 if (it.d.lustre.it_lock_mode != 0) {
870 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
871 it.d.lustre.it_lock_mode);
872 it.d.lustre.it_lock_mode = 0;
875 ll_intent_release(&it);
882 * Release lease and close the file.
883 * It will check if the lease has ever broken.
/* On return, *lease_broken (if non-NULL) tells whether the lease lock
 * was already cancelled before release.
 * NOTE(review): fragment — some lines missing around the lock lookup. */
885 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
888 struct ldlm_lock *lock;
889 bool cancelled = true;
892 lock = ldlm_handle2lock(&och->och_lease_handle);
894 lock_res_and_lock(lock);
895 cancelled = ldlm_is_cancel(lock);
896 unlock_res_and_lock(lock);
900 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
901 PFID(&ll_i2info(inode)->lli_fid), cancelled);
/* Presumably only cancel synchronously if not already cancelled —
 * the guarding condition is on a missing line; confirm in full source. */
904 ldlm_cli_cancel(&och->och_lease_handle, 0);
905 if (lease_broken != NULL)
906 *lease_broken = cancelled;
908 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
913 /* Fills the obdo with the attributes for the lsm */
/* Issues an async OST getattr for the stripe objects and waits for the
 * result set; @sync requests the getattr under a server-side lock.
 * NOTE(review): fragment — allocation of oinfo.oi_oa, some error checks
 * and the return are on missing lines. */
914 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
915 struct obd_capa *capa, struct obdo *obdo,
916 __u64 ioepoch, int sync)
918 struct ptlrpc_request_set *set;
919 struct obd_info oinfo = { { { 0 } } };
922 LASSERT(lsm != NULL);
926 oinfo.oi_oa->o_oi = lsm->lsm_oi;
927 oinfo.oi_oa->o_mode = S_IFREG;
928 oinfo.oi_oa->o_ioepoch = ioepoch;
929 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
930 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
931 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
932 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
933 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
934 OBD_MD_FLDATAVERSION;
935 oinfo.oi_capa = capa;
/* Server-side lock requested: mark the getattr with OBD_FL_SRVLOCK. */
937 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
938 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
941 set = ptlrpc_prep_set();
943 CERROR("can't allocate ptlrpc set\n");
946 rc = obd_getattr_async(exp, &oinfo, set);
948 rc = ptlrpc_set_wait(set);
949 ptlrpc_set_destroy(set);
/* Keep only the attribute bits the OSTs are authoritative for. */
952 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
953 OBD_MD_FLATIME | OBD_MD_FLMTIME |
954 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
955 OBD_MD_FLDATAVERSION);
960 * Performs the getattr on the inode and updates its fields.
961 * If @sync != 0, perform the getattr under the server-side lock.
/* NOTE(review): fragment — capa release and return are on missing lines. */
963 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
964 __u64 ioepoch, int sync)
966 struct obd_capa *capa = ll_mdscapa_get(inode);
967 struct lov_stripe_md *lsm;
970 lsm = ccc_inode_lsm_get(inode);
971 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
972 capa, obdo, ioepoch, sync);
975 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
/* Apply only the valid bits returned by the OSTs onto the inode. */
977 obdo_refresh_inode(inode, obdo, obdo->o_valid);
978 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
979 POSTID(oi), i_size_read(inode),
980 (unsigned long long)inode->i_blocks,
981 1UL << inode->i_blkbits);
983 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-provided timestamps (lli_lvb) with OST attributes obtained
 * via cl_object_attr_get(): each timestamp takes the newer of the two,
 * and size/blocks are refreshed from the cl_object attributes — all under
 * the inode size lock.
 * NOTE(review): fragment — lvb declaration and the return are on lines
 * missing from this excerpt.
 */
987 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = ccc_env_thread_attr(env);
995 ll_inode_size_lock(inode);
996 /* merge timestamps the most recently obtained from mds with
997 timestamps obtained from osts */
998 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
999 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1002 lvb.lvb_size = i_size_read(inode);
1003 lvb.lvb_blocks = inode->i_blocks;
1004 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005 lvb.lvb_atime = LTIME_S(inode->i_atime);
1006 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1008 cl_object_attr_lock(obj);
1009 rc = cl_object_attr_get(env, obj, attr);
1010 cl_object_attr_unlock(obj);
/* Take the maximum of MDS and OST timestamps per field. */
1013 if (lvb.lvb_atime < attr->cat_atime)
1014 lvb.lvb_atime = attr->cat_atime;
1015 if (lvb.lvb_ctime < attr->cat_ctime)
1016 lvb.lvb_ctime = attr->cat_ctime;
1017 if (lvb.lvb_mtime < attr->cat_mtime)
1018 lvb.lvb_mtime = attr->cat_mtime;
1020 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1021 PFID(&lli->lli_fid), attr->cat_size);
1022 cl_isize_write_nolock(inode, attr->cat_size);
1024 inode->i_blocks = attr->cat_blocks;
1026 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1030 ll_inode_size_unlock(inode);
/*
 * Fetch size/blocks/timestamps for @lsm from the OSTs and copy them into
 * the user-visible stat structure @st.
 * NOTE(review): fragment — the st parameter declaration and return are
 * on lines missing from this excerpt.
 */
1035 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1038 struct obdo obdo = { 0 };
1041 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1043 st->st_size = obdo.o_size;
1044 st->st_blocks = obdo.o_blocks;
1045 st->st_mtime = obdo.o_mtime;
1046 st->st_atime = obdo.o_atime;
1047 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file, mirroring
 * the checks in the VFS file_accessed()/touch_atime() helpers: per-file
 * O_NOATIME, per-inode S_NOATIME, mount-level noatime/ro, and the
 * nodiratime variants for directories.
 * NOTE(review): fragment — the return statements follow each condition
 * on lines missing from this excerpt.
 */
1054 const struct vfsmount *mnt = file->f_path.mnt;
1055 const struct inode *inode = file_inode(file);
1057 /* Adapted from file_accessed() and touch_atime().*/
1058 if (file->f_flags & O_NOATIME)
1061 if (inode->i_flags & S_NOATIME)
1064 if (IS_NOATIME(inode))
1067 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1070 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1073 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read (@write == 0) or write (@write != 0) on
 * @file: nonblocking/append/sync flags from f_flags, lock policy (never
 * for nolock files, mandatory for O_APPEND, maybe otherwise), and noatime.
 * NOTE(review): fragment — some lines between statements are missing.
 */
1079 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1081 struct inode *inode = file_inode(file);
1083 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1085 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087 file->f_flags & O_DIRECT ||
1090 io->ci_obj = ll_i2info(inode)->lli_clob;
1091 io->ci_lockreq = CILR_MAYBE;
1092 if (ll_file_nolock(file)) {
1093 io->ci_lockreq = CILR_NEVER;
1094 io->ci_no_srvlock = 1;
1095 } else if (file->f_flags & O_APPEND) {
1096 io->ci_lockreq = CILR_MANDATORY;
1099 io->ci_noatime = file_is_noatime(file);
/*
 * Common IO engine for read/write/splice: sets up a cl_io, wires in the
 * per-subtype arguments (normal iter vs splice pipe), takes the
 * truncate/write serialization locks, runs cl_io_loop(), and tallies
 * read/write statistics. Returns bytes transferred or a negative errno.
 * NOTE(review): fragment — restart handling, gotos and several labels
 * are on lines missing from this excerpt.
 */
1103 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104 struct file *file, enum cl_io_type iot,
1105 loff_t *ppos, size_t count)
1107 struct ll_inode_info *lli = ll_i2info(file_inode(file));
1108 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1113 io = ccc_env_thread_io(env);
1114 ll_io_init(io, file, iot == CIT_WRITE);
1116 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117 struct vvp_io *vio = vvp_env_io(env);
1118 struct ccc_io *cio = ccc_env_io(env);
1119 int write_mutex_locked = 0;
1121 cio->cui_fd = LUSTRE_FPRIVATE(file);
1122 vio->cui_io_subtype = args->via_io_subtype;
1124 switch (vio->cui_io_subtype) {
1126 cio->cui_iter = args->u.normal.via_iter;
1127 cio->cui_iocb = args->u.normal.via_iocb;
/* Writers (without a group lock) serialize via lli_write_mutex;
 * readers only take the truncate semaphore shared. */
1128 if ((iot == CIT_WRITE) &&
1129 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130 if (mutex_lock_interruptible(&lli->
1132 result = -ERESTARTSYS;
1135 write_mutex_locked = 1;
1136 } else if (iot == CIT_READ) {
1137 down_read(&lli->lli_trunc_sem);
1141 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142 vio->u.splice.cui_flags = args->u.splice.via_flags;
1145 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1148 result = cl_io_loop(env, io);
1149 if (write_mutex_locked)
1150 mutex_unlock(&lli->lli_write_mutex);
1151 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152 up_read(&lli->lli_trunc_sem);
1154 /* cl_io_rw_init() handled IO */
1155 result = io->ci_result;
1158 if (io->ci_nob > 0) {
1159 result = io->ci_nob;
1160 *ppos = io->u.ci_wr.wr.crw_pos;
1164 cl_io_fini(env, io);
1165 /* If any bit been read/written (result != 0), we just return
1166 * short read/write instead of restart io. */
1167 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1168 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1169 iot == CIT_READ ? "read" : "write",
1170 file, *ppos, count);
1171 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1175 if (iot == CIT_READ) {
1177 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1178 LPROC_LL_READ_BYTES, result);
1179 } else if (iot == CIT_WRITE) {
1181 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1182 LPROC_LL_WRITE_BYTES, result);
1183 fd->fd_write_failed = false;
/* Remember a failed write (other than a restartable syscall) so fsync
 * and close paths can report it. */
1184 } else if (result != -ERESTARTSYS) {
1185 fd->fd_write_failed = true;
/*
 * VFS ->read_iter hook: obtains a cl environment, packs the iov_iter and
 * kiocb into vvp_io_args, and dispatches to ll_file_io_generic(CIT_READ).
 * NOTE(review): fragment — env error check and return are on missing lines.
 */
1192 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1195 struct vvp_io_args *args;
1199 env = cl_env_get(&refcheck);
1201 return PTR_ERR(env);
1203 args = vvp_env_args(env, IO_NORMAL);
1204 args->u.normal.via_iter = to;
1205 args->u.normal.via_iocb = iocb;
1207 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1208 &iocb->ki_pos, iov_iter_count(to));
1209 cl_env_put(env, &refcheck);
1214 * Write to a file (through the page cache).
/* VFS ->write_iter hook; mirrors ll_file_read_iter() with CIT_WRITE.
 * NOTE(review): fragment — env error check and return are on missing
 * lines. */
1216 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1219 struct vvp_io_args *args;
1223 env = cl_env_get(&refcheck);
1225 return PTR_ERR(env);
1227 args = vvp_env_args(env, IO_NORMAL);
1228 args->u.normal.via_iter = from;
1229 args->u.normal.via_iocb = iocb;
1231 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232 &iocb->ki_pos, iov_iter_count(from));
1233 cl_env_put(env, &refcheck);
1238 * Send file content (through pagecache) somewhere with helper
/* splice_read implementation: routes through ll_file_io_generic with the
 * IO_SPLICE subtype so the pipe/flags are carried in vvp_io_args.
 * NOTE(review): fragment — env error check and return are on missing
 * lines. */
1240 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241 struct pipe_inode_info *pipe, size_t count,
1245 struct vvp_io_args *args;
1249 env = cl_env_get(&refcheck);
1251 return PTR_ERR(env);
1253 args = vvp_env_args(env, IO_SPLICE);
1254 args->u.splice.via_pipe = pipe;
1255 args->u.splice.via_flags = flags;
1257 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258 cl_env_put(env, &refcheck);
1262 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1264 struct obd_export *exp = ll_i2dtexp(inode);
1265 struct obd_trans_info oti = { 0 };
1266 struct obdo *oa = NULL;
1269 struct lov_stripe_md *lsm = NULL, *lsm2;
1275 lsm = ccc_inode_lsm_get(inode);
1276 if (!lsm_has_objects(lsm)) {
1281 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282 (lsm->lsm_stripe_count));
1284 OBD_ALLOC_LARGE(lsm2, lsm_size);
1291 oa->o_nlink = ost_idx;
1292 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297 memcpy(lsm2, lsm, lsm_size);
1298 ll_inode_size_lock(inode);
1299 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300 ll_inode_size_unlock(inode);
1302 OBD_FREE_LARGE(lsm2, lsm_size);
1305 ccc_inode_lsm_put(inode, lsm);
1310 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1312 struct ll_recreate_obj ucreat;
1315 if (!capable(CFS_CAP_SYS_ADMIN))
1318 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1322 ostid_set_seq_mdt0(&oi);
1323 ostid_set_id(&oi, ucreat.lrc_id);
1324 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1327 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1333 if (!capable(CFS_CAP_SYS_ADMIN))
1336 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1339 fid_to_ostid(&fid, &oi);
1340 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1341 return ll_lov_recreate(inode, &oi, ost_idx);
1344 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1345 int flags, struct lov_user_md *lum, int lum_size)
1347 struct lov_stripe_md *lsm = NULL;
1348 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1351 lsm = ccc_inode_lsm_get(inode);
1353 ccc_inode_lsm_put(inode, lsm);
1354 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1360 ll_inode_size_lock(inode);
1361 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1364 rc = oit.d.lustre.it_status;
1368 ll_release_openhandle(inode, &oit);
1371 ll_inode_size_unlock(inode);
1372 ll_intent_release(&oit);
1373 ccc_inode_lsm_put(inode, lsm);
1377 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1381 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382 struct lov_mds_md **lmmp, int *lmm_size,
1383 struct ptlrpc_request **request)
1385 struct ll_sb_info *sbi = ll_i2sbi(inode);
1386 struct mdt_body *body;
1387 struct lov_mds_md *lmm = NULL;
1388 struct ptlrpc_request *req = NULL;
1389 struct md_op_data *op_data;
1392 rc = ll_get_default_mdsize(sbi, &lmmsize);
1396 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397 strlen(filename), lmmsize,
1398 LUSTRE_OPC_ANY, NULL);
1399 if (IS_ERR(op_data))
1400 return PTR_ERR(op_data);
1402 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404 ll_finish_md_op_data(op_data);
1406 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1411 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1414 lmmsize = body->eadatasize;
1416 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1422 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423 LASSERT(lmm != NULL);
1425 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1439 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1443 /* if function called for directory - we should
1444 * avoid swab not existent lsm objects */
1445 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1451 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453 if (S_ISREG(body->mode))
1454 lustre_swab_lov_user_md_objects(
1455 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1462 *lmm_size = lmmsize;
1467 static int ll_lov_setea(struct inode *inode, struct file *file,
1470 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471 struct lov_user_md *lump;
1472 int lum_size = sizeof(struct lov_user_md) +
1473 sizeof(struct lov_user_ost_data);
1476 if (!capable(CFS_CAP_SYS_ADMIN))
1479 OBD_ALLOC_LARGE(lump, lum_size);
1483 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1484 OBD_FREE_LARGE(lump, lum_size);
1488 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1490 cl_lov_delay_create_clear(&file->f_flags);
1492 OBD_FREE_LARGE(lump, lum_size);
1496 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1499 struct lov_user_md_v3 lumv3;
1500 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1502 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1504 int flags = FMODE_WRITE;
1506 /* first try with v1 which is smaller than v3 */
1507 lum_size = sizeof(struct lov_user_md_v1);
1508 if (copy_from_user(lumv1, lumv1p, lum_size))
1511 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512 lum_size = sizeof(struct lov_user_md_v3);
1513 if (copy_from_user(&lumv3, lumv3p, lum_size))
1517 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1519 cl_lov_delay_create_clear(&file->f_flags);
1521 struct lov_stripe_md *lsm;
1524 put_user(0, &lumv1p->lmm_stripe_count);
1526 ll_layout_refresh(inode, &gen);
1527 lsm = ccc_inode_lsm_get(inode);
1528 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529 0, lsm, (void *)arg);
1530 ccc_inode_lsm_put(inode, lsm);
1535 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1537 struct lov_stripe_md *lsm;
1540 lsm = ccc_inode_lsm_get(inode);
1542 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1544 ccc_inode_lsm_put(inode, lsm);
1549 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1551 struct ll_inode_info *lli = ll_i2info(inode);
1552 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1553 struct ccc_grouplock grouplock;
1556 if (ll_file_nolock(file))
1559 spin_lock(&lli->lli_lock);
1560 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1561 CWARN("group lock already existed with gid %lu\n",
1562 fd->fd_grouplock.cg_gid);
1563 spin_unlock(&lli->lli_lock);
1566 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1567 spin_unlock(&lli->lli_lock);
1569 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1570 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1574 spin_lock(&lli->lli_lock);
1575 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1576 spin_unlock(&lli->lli_lock);
1577 CERROR("another thread just won the race\n");
1578 cl_put_grouplock(&grouplock);
1582 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1583 fd->fd_grouplock = grouplock;
1584 spin_unlock(&lli->lli_lock);
1586 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1590 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1592 struct ll_inode_info *lli = ll_i2info(inode);
1593 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1594 struct ccc_grouplock grouplock;
1596 spin_lock(&lli->lli_lock);
1597 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1598 spin_unlock(&lli->lli_lock);
1599 CWARN("no group lock held\n");
1602 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1604 if (fd->fd_grouplock.cg_gid != arg) {
1605 CWARN("group lock %lu doesn't match current id %lu\n",
1606 arg, fd->fd_grouplock.cg_gid);
1607 spin_unlock(&lli->lli_lock);
1611 grouplock = fd->fd_grouplock;
1612 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1613 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1614 spin_unlock(&lli->lli_lock);
1616 cl_put_grouplock(&grouplock);
1617 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1622 * Close inode open handle
1624 * \param inode [in] inode in question
1625 * \param it [in,out] intent which contains open info and result
1628 * \retval <0 failure
1630 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1632 struct obd_client_handle *och;
1637 /* Root ? Do nothing. */
1638 if (is_root_inode(inode))
1641 /* No open handle to close? Move away */
1642 if (!it_disposition(it, DISP_OPEN_OPEN))
1645 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1647 och = kzalloc(sizeof(*och), GFP_NOFS);
1653 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1655 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1658 /* this one is in place of ll_file_open */
1659 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1660 ptlrpc_req_finished(it->d.lustre.it_data);
1661 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1667 * Get size for inode for which FIEMAP mapping is requested.
1668 * Make the FIEMAP get_info call and returns the result.
1670 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1673 struct obd_export *exp = ll_i2dtexp(inode);
1674 struct lov_stripe_md *lsm = NULL;
1675 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1676 __u32 vallen = num_bytes;
1679 /* Checks for fiemap flags */
1680 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1681 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1685 /* Check for FIEMAP_FLAG_SYNC */
1686 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1687 rc = filemap_fdatawrite(inode->i_mapping);
1692 lsm = ccc_inode_lsm_get(inode);
1696 /* If the stripe_count > 1 and the application does not understand
1697 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1699 if (lsm->lsm_stripe_count > 1 &&
1700 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1705 fm_key.oa.o_oi = lsm->lsm_oi;
1706 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1708 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1709 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1710 /* If filesize is 0, then there would be no objects for mapping */
1711 if (fm_key.oa.o_size == 0) {
1712 fiemap->fm_mapped_extents = 0;
1717 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1719 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1722 CERROR("obd_get_info failed: rc = %d\n", rc);
1725 ccc_inode_lsm_put(inode, lsm);
1729 int ll_fid2path(struct inode *inode, void __user *arg)
1731 struct obd_export *exp = ll_i2mdexp(inode);
1732 const struct getinfo_fid2path __user *gfin = arg;
1733 struct getinfo_fid2path *gfout;
1738 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1739 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1742 /* Only need to get the buflen */
1743 if (get_user(pathlen, &gfin->gf_pathlen))
1746 if (pathlen > PATH_MAX)
1749 outsize = sizeof(*gfout) + pathlen;
1751 gfout = kzalloc(outsize, GFP_NOFS);
1755 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1760 /* Call mdc_iocontrol */
1761 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1765 if (copy_to_user(arg, gfout, outsize))
1769 OBD_FREE(gfout, outsize);
1773 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1775 struct ll_user_fiemap *fiemap_s;
1776 size_t num_bytes, ret_bytes;
1777 unsigned int extent_count;
1780 /* Get the extent count so we can calculate the size of
1781 * required fiemap buffer */
1782 if (get_user(extent_count,
1783 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1787 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1789 num_bytes = sizeof(*fiemap_s) + (extent_count *
1790 sizeof(struct ll_fiemap_extent));
1792 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1793 if (fiemap_s == NULL)
1796 /* get the fiemap value */
1797 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1798 sizeof(*fiemap_s))) {
1803 /* If fm_extent_count is non-zero, read the first extent since
1804 * it is used to calculate end_offset and device from previous
1807 if (copy_from_user(&fiemap_s->fm_extents[0],
1808 (char __user *)arg + sizeof(*fiemap_s),
1809 sizeof(struct ll_fiemap_extent))) {
1815 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1819 ret_bytes = sizeof(struct ll_user_fiemap);
1821 if (extent_count != 0)
1822 ret_bytes += (fiemap_s->fm_mapped_extents *
1823 sizeof(struct ll_fiemap_extent));
1825 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1829 OBD_FREE_LARGE(fiemap_s, num_bytes);
1834 * Read the data_version for inode.
1836 * This value is computed using stripe object version on OST.
1837 * Version is computed using server side locking.
1839 * @param extent_lock Take extent lock. Not needed if a process is already
1840 * holding the OST object group locks.
1842 int ll_data_version(struct inode *inode, __u64 *data_version,
1845 struct lov_stripe_md *lsm = NULL;
1846 struct ll_sb_info *sbi = ll_i2sbi(inode);
1847 struct obdo *obdo = NULL;
1850 /* If no stripe, we consider version is 0. */
1851 lsm = ccc_inode_lsm_get(inode);
1852 if (!lsm_has_objects(lsm)) {
1854 CDEBUG(D_INODE, "No object for inode\n");
1859 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1865 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1867 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1870 *data_version = obdo->o_data_version;
1875 ccc_inode_lsm_put(inode, lsm);
1880 * Trigger a HSM release request for the provided inode.
1882 int ll_hsm_release(struct inode *inode)
1884 struct cl_env_nest nest;
1886 struct obd_client_handle *och = NULL;
1887 __u64 data_version = 0;
1891 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1892 ll_get_fsname(inode->i_sb, NULL, 0),
1893 PFID(&ll_i2info(inode)->lli_fid));
1895 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1901 /* Grab latest data_version and [am]time values */
1902 rc = ll_data_version(inode, &data_version, 1);
1906 env = cl_env_nested_get(&nest);
1912 ll_merge_lvb(env, inode);
1913 cl_env_nested_put(&nest, env);
1915 /* Release the file.
1916 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1917 * we still need it to pack l_remote_handle to MDT. */
1918 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1924 if (och != NULL && !IS_ERR(och)) /* close the file */
1925 ll_lease_close(och, inode, NULL);
1930 struct ll_swap_stack {
1931 struct iattr ia1, ia2;
1933 struct inode *inode1, *inode2;
1934 bool check_dv1, check_dv2;
1937 static int ll_swap_layouts(struct file *file1, struct file *file2,
1938 struct lustre_swap_layouts *lsl)
1940 struct mdc_swap_layouts msl;
1941 struct md_op_data *op_data;
1944 struct ll_swap_stack *llss = NULL;
1947 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1951 llss->inode1 = file_inode(file1);
1952 llss->inode2 = file_inode(file2);
1954 if (!S_ISREG(llss->inode2->i_mode)) {
1959 if (inode_permission(llss->inode1, MAY_WRITE) ||
1960 inode_permission(llss->inode2, MAY_WRITE)) {
1965 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1970 /* we use 2 bool because it is easier to swap than 2 bits */
1971 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1972 llss->check_dv1 = true;
1974 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1975 llss->check_dv2 = true;
1977 /* we cannot use lsl->sl_dvX directly because we may swap them */
1978 llss->dv1 = lsl->sl_dv1;
1979 llss->dv2 = lsl->sl_dv2;
1981 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1982 if (rc == 0) /* same file, done! */ {
1987 if (rc < 0) { /* sequentialize it */
1988 swap(llss->inode1, llss->inode2);
1990 swap(llss->dv1, llss->dv2);
1991 swap(llss->check_dv1, llss->check_dv2);
1995 if (gid != 0) { /* application asks to flush dirty cache */
1996 rc = ll_get_grouplock(llss->inode1, file1, gid);
2000 rc = ll_get_grouplock(llss->inode2, file2, gid);
2002 ll_put_grouplock(llss->inode1, file1, gid);
2007 /* to be able to restore mtime and atime after swap
2008 * we need to first save them */
2010 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2011 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2012 llss->ia1.ia_atime = llss->inode1->i_atime;
2013 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2014 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2015 llss->ia2.ia_atime = llss->inode2->i_atime;
2016 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2019 /* ultimate check, before swapping the layouts we check if
2020 * dataversion has changed (if requested) */
2021 if (llss->check_dv1) {
2022 rc = ll_data_version(llss->inode1, &dv, 0);
2025 if (dv != llss->dv1) {
2031 if (llss->check_dv2) {
2032 rc = ll_data_version(llss->inode2, &dv, 0);
2035 if (dv != llss->dv2) {
2041 /* struct md_op_data is used to send the swap args to the mdt
2042 * only flags is missing, so we use struct mdc_swap_layouts
2043 * through the md_op_data->op_data */
2044 /* flags from user space have to be converted before they are send to
2045 * server, no flag is sent today, they are only used on the client */
2048 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2049 0, LUSTRE_OPC_ANY, &msl);
2050 if (IS_ERR(op_data)) {
2051 rc = PTR_ERR(op_data);
2055 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2056 sizeof(*op_data), op_data, NULL);
2057 ll_finish_md_op_data(op_data);
2061 ll_put_grouplock(llss->inode2, file2, gid);
2062 ll_put_grouplock(llss->inode1, file1, gid);
2065 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2069 /* clear useless flags */
2070 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2071 llss->ia1.ia_valid &= ~ATTR_MTIME;
2072 llss->ia2.ia_valid &= ~ATTR_MTIME;
2075 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2076 llss->ia1.ia_valid &= ~ATTR_ATIME;
2077 llss->ia2.ia_valid &= ~ATTR_ATIME;
2080 /* update time if requested */
2082 if (llss->ia2.ia_valid != 0) {
2083 mutex_lock(&llss->inode1->i_mutex);
2084 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2085 mutex_unlock(&llss->inode1->i_mutex);
2088 if (llss->ia1.ia_valid != 0) {
2091 mutex_lock(&llss->inode2->i_mutex);
2092 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2093 mutex_unlock(&llss->inode2->i_mutex);
2105 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2107 struct md_op_data *op_data;
2110 /* Non-root users are forbidden to set or clear flags which are
2111 * NOT defined in HSM_USER_MASK. */
2112 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2113 !capable(CFS_CAP_SYS_ADMIN))
2116 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2117 LUSTRE_OPC_ANY, hss);
2118 if (IS_ERR(op_data))
2119 return PTR_ERR(op_data);
2121 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2122 sizeof(*op_data), op_data, NULL);
2124 ll_finish_md_op_data(op_data);
2129 static int ll_hsm_import(struct inode *inode, struct file *file,
2130 struct hsm_user_import *hui)
2132 struct hsm_state_set *hss = NULL;
2133 struct iattr *attr = NULL;
2137 if (!S_ISREG(inode->i_mode))
2141 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2147 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2148 hss->hss_archive_id = hui->hui_archive_id;
2149 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2150 rc = ll_hsm_state_set(inode, hss);
2154 attr = kzalloc(sizeof(*attr), GFP_NOFS);
2160 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2161 attr->ia_mode |= S_IFREG;
2162 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2163 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2164 attr->ia_size = hui->hui_size;
2165 attr->ia_mtime.tv_sec = hui->hui_mtime;
2166 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2167 attr->ia_atime.tv_sec = hui->hui_atime;
2168 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2170 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2171 ATTR_UID | ATTR_GID |
2172 ATTR_MTIME | ATTR_MTIME_SET |
2173 ATTR_ATIME | ATTR_ATIME_SET;
2175 mutex_lock(&inode->i_mutex);
2177 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2181 mutex_unlock(&inode->i_mutex);
2194 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2196 struct inode *inode = file_inode(file);
2197 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2200 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2201 inode->i_generation, inode, cmd);
2202 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2204 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2205 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2209 case LL_IOC_GETFLAGS:
2210 /* Get the current value of the file flags */
2211 return put_user(fd->fd_flags, (int *)arg);
2212 case LL_IOC_SETFLAGS:
2213 case LL_IOC_CLRFLAGS:
2214 /* Set or clear specific file flags */
2215 /* XXX This probably needs checks to ensure the flags are
2216 * not abused, and to handle any flag side effects.
2218 if (get_user(flags, (int *) arg))
2221 if (cmd == LL_IOC_SETFLAGS) {
2222 if ((flags & LL_FILE_IGNORE_LOCK) &&
2223 !(file->f_flags & O_DIRECT)) {
2224 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2229 fd->fd_flags |= flags;
2231 fd->fd_flags &= ~flags;
2234 case LL_IOC_LOV_SETSTRIPE:
2235 return ll_lov_setstripe(inode, file, arg);
2236 case LL_IOC_LOV_SETEA:
2237 return ll_lov_setea(inode, file, arg);
2238 case LL_IOC_LOV_SWAP_LAYOUTS: {
2240 struct lustre_swap_layouts lsl;
2242 if (copy_from_user(&lsl, (char *)arg,
2243 sizeof(struct lustre_swap_layouts)))
2246 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2249 file2 = fget(lsl.sl_fd);
2254 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2255 rc = ll_swap_layouts(file, file2, &lsl);
2259 case LL_IOC_LOV_GETSTRIPE:
2260 return ll_lov_getstripe(inode, arg);
2261 case LL_IOC_RECREATE_OBJ:
2262 return ll_lov_recreate_obj(inode, arg);
2263 case LL_IOC_RECREATE_FID:
2264 return ll_lov_recreate_fid(inode, arg);
2265 case FSFILT_IOC_FIEMAP:
2266 return ll_ioctl_fiemap(inode, arg);
2267 case FSFILT_IOC_GETFLAGS:
2268 case FSFILT_IOC_SETFLAGS:
2269 return ll_iocontrol(inode, file, cmd, arg);
2270 case FSFILT_IOC_GETVERSION_OLD:
2271 case FSFILT_IOC_GETVERSION:
2272 return put_user(inode->i_generation, (int *)arg);
2273 case LL_IOC_GROUP_LOCK:
2274 return ll_get_grouplock(inode, file, arg);
2275 case LL_IOC_GROUP_UNLOCK:
2276 return ll_put_grouplock(inode, file, arg);
2277 case IOC_OBD_STATFS:
2278 return ll_obd_statfs(inode, (void *)arg);
2280 /* We need to special case any other ioctls we want to handle,
2281 * to send them to the MDS/OST as appropriate and to properly
2282 * network encode the arg field.
2283 case FSFILT_IOC_SETVERSION_OLD:
2284 case FSFILT_IOC_SETVERSION:
2286 case LL_IOC_FLUSHCTX:
2287 return ll_flush_ctx(inode);
2288 case LL_IOC_PATH2FID: {
2289 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2290 sizeof(struct lu_fid)))
2295 case OBD_IOC_FID2PATH:
2296 return ll_fid2path(inode, (void *)arg);
2297 case LL_IOC_DATA_VERSION: {
2298 struct ioc_data_version idv;
2301 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2304 rc = ll_data_version(inode, &idv.idv_version,
2305 !(idv.idv_flags & LL_DV_NOFLUSH));
2307 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2313 case LL_IOC_GET_MDTIDX: {
2316 mdtidx = ll_get_mdt_idx(inode);
2320 if (put_user((int)mdtidx, (int *)arg))
2325 case OBD_IOC_GETDTNAME:
2326 case OBD_IOC_GETMDNAME:
2327 return ll_get_obd_name(inode, cmd, arg);
2328 case LL_IOC_HSM_STATE_GET: {
2329 struct md_op_data *op_data;
2330 struct hsm_user_state *hus;
2333 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2337 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2338 LUSTRE_OPC_ANY, hus);
2339 if (IS_ERR(op_data)) {
2341 return PTR_ERR(op_data);
2344 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2347 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2350 ll_finish_md_op_data(op_data);
2354 case LL_IOC_HSM_STATE_SET: {
2355 struct hsm_state_set *hss;
2358 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2362 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2367 rc = ll_hsm_state_set(inode, hss);
2372 case LL_IOC_HSM_ACTION: {
2373 struct md_op_data *op_data;
2374 struct hsm_current_action *hca;
2377 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2381 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2382 LUSTRE_OPC_ANY, hca);
2383 if (IS_ERR(op_data)) {
2385 return PTR_ERR(op_data);
2388 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2391 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2394 ll_finish_md_op_data(op_data);
2398 case LL_IOC_SET_LEASE: {
2399 struct ll_inode_info *lli = ll_i2info(inode);
2400 struct obd_client_handle *och = NULL;
2406 if (!(file->f_mode & FMODE_WRITE))
2411 if (!(file->f_mode & FMODE_READ))
2416 mutex_lock(&lli->lli_och_mutex);
2417 if (fd->fd_lease_och != NULL) {
2418 och = fd->fd_lease_och;
2419 fd->fd_lease_och = NULL;
2421 mutex_unlock(&lli->lli_och_mutex);
2424 mode = och->och_flags &
2425 (FMODE_READ|FMODE_WRITE);
2426 rc = ll_lease_close(och, inode, &lease_broken);
2427 if (rc == 0 && lease_broken)
2433 /* return the type of lease or error */
2434 return rc < 0 ? rc : (int)mode;
2439 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2441 /* apply for lease */
2442 och = ll_lease_open(inode, file, mode, 0);
2444 return PTR_ERR(och);
2447 mutex_lock(&lli->lli_och_mutex);
2448 if (fd->fd_lease_och == NULL) {
2449 fd->fd_lease_och = och;
2452 mutex_unlock(&lli->lli_och_mutex);
2454 /* impossible now that only excl is supported for now */
2455 ll_lease_close(och, inode, &lease_broken);
2460 case LL_IOC_GET_LEASE: {
2461 struct ll_inode_info *lli = ll_i2info(inode);
2462 struct ldlm_lock *lock = NULL;
2465 mutex_lock(&lli->lli_och_mutex);
2466 if (fd->fd_lease_och != NULL) {
2467 struct obd_client_handle *och = fd->fd_lease_och;
2469 lock = ldlm_handle2lock(&och->och_lease_handle);
2471 lock_res_and_lock(lock);
2472 if (!ldlm_is_cancel(lock))
2473 rc = och->och_flags &
2474 (FMODE_READ | FMODE_WRITE);
2475 unlock_res_and_lock(lock);
2476 ldlm_lock_put(lock);
2479 mutex_unlock(&lli->lli_och_mutex);
2482 case LL_IOC_HSM_IMPORT: {
2483 struct hsm_user_import *hui;
2485 hui = kzalloc(sizeof(*hui), GFP_NOFS);
2489 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2494 rc = ll_hsm_import(inode, file, hui);
2503 ll_iocontrol_call(inode, file, cmd, arg, &err))
2506 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2513 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2515 struct inode *inode = file_inode(file);
2516 loff_t retval, eof = 0;
2518 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2519 (origin == SEEK_CUR) ? file->f_pos : 0);
2520 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2521 inode->i_ino, inode->i_generation, inode, retval, retval,
2523 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2525 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2526 retval = ll_glimpse_size(inode);
2529 eof = i_size_read(inode);
2532 retval = generic_file_llseek_size(file, offset, origin,
2533 ll_file_maxbytes(inode), eof);
2537 static int ll_flush(struct file *file, fl_owner_t id)
2539 struct inode *inode = file_inode(file);
2540 struct ll_inode_info *lli = ll_i2info(inode);
2541 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2544 LASSERT(!S_ISDIR(inode->i_mode));
2546 /* catch async errors that were recorded back when async writeback
2547 * failed for pages in this mapping. */
2548 rc = lli->lli_async_rc;
2549 lli->lli_async_rc = 0;
2550 err = lov_read_and_clear_async_rc(lli->lli_clob);
2554 /* The application has been told write failure already.
2555 * Do not report failure again. */
2556 if (fd->fd_write_failed)
2558 return rc ? -EIO : 0;
2562 * Called to make sure a portion of file has been written out.
2563 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2565 * Return how many pages have been written.
2567 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2568 enum cl_fsync_mode mode, int ignore_layout)
2570 struct cl_env_nest nest;
2573 struct obd_capa *capa = NULL;
2574 struct cl_fsync_io *fio;
2577 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2578 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2581 env = cl_env_nested_get(&nest);
2583 return PTR_ERR(env);
2585 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2587 io = ccc_env_thread_io(env);
2588 io->ci_obj = cl_i2info(inode)->lli_clob;
2589 io->ci_ignore_layout = ignore_layout;
2591 /* initialize parameters for sync */
2592 fio = &io->u.ci_fsync;
2593 fio->fi_capa = capa;
2594 fio->fi_start = start;
2596 fio->fi_fid = ll_inode2fid(inode);
2597 fio->fi_mode = mode;
2598 fio->fi_nr_written = 0;
2600 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2601 result = cl_io_loop(env, io);
2603 result = io->ci_result;
2605 result = fio->fi_nr_written;
2606 cl_io_fini(env, io);
2607 cl_env_nested_put(&nest, env);
2614 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2616 struct inode *inode = file_inode(file);
2617 struct ll_inode_info *lli = ll_i2info(inode);
2618 struct ptlrpc_request *req;
2619 struct obd_capa *oc;
2622 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2623 inode->i_generation, inode);
2624 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2626 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2627 mutex_lock(&inode->i_mutex);
2629 /* catch async errors that were recorded back when async writeback
2630 * failed for pages in this mapping. */
2631 if (!S_ISDIR(inode->i_mode)) {
2632 err = lli->lli_async_rc;
2633 lli->lli_async_rc = 0;
2636 err = lov_read_and_clear_async_rc(lli->lli_clob);
2641 oc = ll_mdscapa_get(inode);
2642 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2648 ptlrpc_req_finished(req);
2650 if (S_ISREG(inode->i_mode)) {
2651 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2653 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2654 if (rc == 0 && err < 0)
2657 fd->fd_write_failed = true;
2659 fd->fd_write_failed = false;
2662 mutex_unlock(&inode->i_mutex);
/*
 * ll_file_flock(): service flock(2)/fcntl(2) advisory lock requests by
 * enqueueing an LDLM_FLOCK lock with the MDS and then mirroring the result
 * into the local VFS lock tables.
 *
 * NOTE(review): this excerpt is elided (original line numbers are
 * non-contiguous); comments describe only the visible code.
 */
2667 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2669 struct inode *inode = file_inode(file);
2670 struct ll_sb_info *sbi = ll_i2sbi(inode);
2671 struct ldlm_enqueue_info einfo = {
2672 .ei_type = LDLM_FLOCK,
2673 .ei_cb_cp = ldlm_flock_completion_ast,
2674 .ei_cbdata = file_lock,
2676 struct md_op_data *op_data;
2677 struct lustre_handle lockh = {0};
2678 ldlm_policy_data_t flock = {{0}};
2683 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2684 inode->i_ino, file_lock);
2686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* only FL_FLOCK (flock(2)) and FL_POSIX (fcntl(2)) styles are supported */
2688 if (file_lock->fl_flags & FL_FLOCK)
2689 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2690 else if (!(file_lock->fl_flags & FL_POSIX))
/* translate the VFS lock description into an LDLM flock policy */
2693 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2694 flock.l_flock.pid = file_lock->fl_pid;
2695 flock.l_flock.start = file_lock->fl_start;
2696 flock.l_flock.end = file_lock->fl_end;
2698 /* Somewhat ugly workaround for svc lockd.
2699 * lockd installs custom fl_lmops->lm_compare_owner that checks
2700 * for the fl_owner to be the same (which it always is on local node
2701 * I guess between lockd processes) and then compares pid.
2702 * As such we assign pid to the owner field to make it all work,
2703 * conflict with normal locks is unlikely since pid space and
2704 * pointer space for current->files are not intersecting */
2705 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2706 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map the VFS lock type onto an LDLM lock mode (labels elided here) */
2708 switch (file_lock->fl_type) {
2710 einfo.ei_mode = LCK_PR;
2713 /* An unlock request may or may not have any relation to
2714 * existing locks so we may not be able to pass a lock handle
2715 * via a normal ldlm_lock_cancel() request. The request may even
2716 * unlock a byte range in the middle of an existing lock. In
2717 * order to process an unlock request we need all of the same
2718 * information that is given with a normal read or write record
2719 * lock request. To avoid creating another ldlm unlock (cancel)
2720 * message we'll treat a LCK_NL flock request as an unlock. */
2721 einfo.ei_mode = LCK_NL;
2724 einfo.ei_mode = LCK_PW;
2727 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2728 file_lock->fl_type);
/* map cmd onto enqueue flags (non-blocking / test-only; labels elided) */
2743 flags = LDLM_FL_BLOCK_NOWAIT;
2749 flags = LDLM_FL_TEST_LOCK;
2750 /* Save the old mode so that if the mode in the lock changes we
2751 * can decrement the appropriate reader or writer refcount. */
2752 file_lock->fl_type = einfo.ei_mode;
2755 CERROR("unknown fcntl lock command: %d\n", cmd);
2759 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2760 LUSTRE_OPC_ANY, NULL);
2761 if (IS_ERR(op_data))
2762 return PTR_ERR(op_data);
2764 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2765 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2766 flock.l_flock.start, flock.l_flock.end);
/* enqueue the flock lock on the MDS; blocks here for F_SETLKW */
2768 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2769 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* on success, record the lock locally so the VFS sees it too
 * (skipped for F_GETLK-style test requests) */
2771 if ((file_lock->fl_flags & FL_FLOCK) &&
2772 (rc == 0 || file_lock->fl_type == F_UNLCK))
2773 rc2 = flock_lock_file_wait(file, file_lock);
2774 if ((file_lock->fl_flags & FL_POSIX) &&
2775 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2776 !(flags & LDLM_FL_TEST_LOCK))
2777 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server lock by re-enqueueing LCK_NL
 * (treated as an unlock, see comment above) */
2779 if (rc2 && file_lock->fl_type != F_UNLCK) {
2780 einfo.ei_mode = LCK_NL;
2781 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2782 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2786 ll_finish_md_op_data(op_data);
2792 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2798 * test if some locks matching bits and l_req_mode are acquired
2799 * - bits can be in different locks
2800 * - if found clear the common lock bits in *bits
2801 * - the bits not found, are kept in *bits
2803 * \param bits [IN] searched lock bits [IN]
2804 * \param l_req_mode [IN] searched lock mode
2805 * \retval boolean, true iff all bits are found
/*
 * ll_have_md_lock(): test (LDLM_FL_TEST_LOCK, no lock is actually taken)
 * whether granted MD locks already cover the inodebits in *bits at mode
 * l_req_mode.  Bits that are found are cleared from *bits; bits not found
 * remain set, so on return *bits == 0 means "all bits covered".
 */
2807 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2809 struct lustre_handle lockh;
2810 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": accept any of CR/CW/PR/PW */
2811 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2812 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2820 fid = &ll_i2info(inode)->lli_fid;
2821 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2822 ldlm_lockname[mode]);
2824 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each still-wanted bit individually; stop early once all found */
2825 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2826 policy.l_inodebits.bits = *bits & (1 << i);
2827 if (policy.l_inodebits.bits == 0)
2830 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2831 &policy, mode, &lockh)) {
2832 struct ldlm_lock *lock;
/* a matched lock may grant more bits than the one probed; clear all of
 * them from *bits (some lines elided in this excerpt) */
2834 lock = ldlm_handle2lock(&lockh);
2837 ~(lock->l_policy_data.l_inodebits.bits);
2838 LDLM_LOCK_PUT(lock);
2840 *bits &= ~policy.l_inodebits.bits;
/*
 * ll_take_md_lock(): match an already-granted MD inodebits lock covering
 * @bits on this inode; on success the lock handle is returned via @lockh
 * (with a reference held, per md_lock_match() semantics).  Returns the
 * matched mode, 0 if nothing matched.
 */
2847 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2848 struct lustre_handle *lockh, __u64 flags,
2851 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2855 fid = &ll_i2info(inode)->lli_fid;
2856 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2858 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2859 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * ll_inode_revalidate_fini(): post-process the result of a revalidate RPC.
 * -ENOENT on a non-regular, non-directory inode is tolerated (the object
 * was unlinked); other failures are logged, with EACCES/EIDRM demoted to
 * D_INFO since they can occur in normal operation.
 */
2864 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2866 /* Already unlinked. Just update nlink and return success */
2867 if (rc == -ENOENT) {
2869 /* This path cannot be hit for regular files unless in
2870 * case of obscure races, so no need to validate size.
2872 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2874 } else if (rc != 0) {
2875 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2876 "%s: revalidate FID "DFID" error: rc = %d\n",
2877 ll_get_fsname(inode->i_sb, NULL, 0),
2878 PFID(ll_inode2fid(inode)), rc);
/*
 * __ll_inode_revalidate(): refresh the MD attributes guarded by @ibits.
 * Two paths, chosen by server capability:
 *  - OBD_CONNECT_ATTRFID: getattr-by-FID via an intent lock (no name);
 *  - otherwise: plain md_getattr(), but only if no covering ibits lock is
 *    already cached locally.
 *
 * NOTE(review): excerpt is elided; comments describe only visible code.
 */
2884 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2886 struct inode *inode = dentry->d_inode;
2887 struct ptlrpc_request *req = NULL;
2888 struct obd_export *exp;
2891 LASSERT(inode != NULL);
2893 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2894 inode->i_ino, inode->i_generation, inode, dentry);
2896 exp = ll_i2mdexp(inode);
2898 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2899 * But under CMD case, it caused some lock issues, should be fixed
2900 * with new CMD ibits lock. See bug 12718 */
2901 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2902 struct lookup_intent oit = { .it_op = IT_GETATTR };
2903 struct md_op_data *op_data;
/* LOOKUP-only revalidation can use the lighter IT_LOOKUP intent */
2905 if (ibits == MDS_INODELOCK_LOOKUP)
2906 oit.it_op = IT_LOOKUP;
2908 /* Call getattr by fid, so do not provide name at all. */
2909 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
2910 dentry->d_inode, NULL, 0, 0,
2911 LUSTRE_OPC_ANY, NULL);
2912 if (IS_ERR(op_data))
2913 return PTR_ERR(op_data);
2915 oit.it_create_mode |= M_CHECK_STALE;
2916 rc = md_intent_lock(exp, op_data, NULL, 0,
2917 /* we are not interested in name
2920 ll_md_blocking_ast, 0);
2921 ll_finish_md_op_data(op_data);
2922 oit.it_create_mode &= ~M_CHECK_STALE;
2924 rc = ll_inode_revalidate_fini(inode, rc);
2928 rc = ll_revalidate_it_finish(req, &oit, dentry);
2930 ll_intent_release(&oit);
2934 /* Unlinked? Unhash dentry, so it is not picked up later by
2935 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2936 here to preserve get_cwd functionality on 2.6.
2938 if (!dentry->d_inode->i_nlink)
2939 d_lustre_invalidate(dentry, 0);
2941 ll_lookup_finish_locks(&oit, dentry);
2942 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2943 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2944 u64 valid = OBD_MD_FLGETATTR;
2945 struct md_op_data *op_data;
/* regular files also need striping (EA) data back with the attrs */
2948 if (S_ISREG(inode->i_mode)) {
2949 rc = ll_get_default_mdsize(sbi, &ealen);
2952 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2955 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2956 0, ealen, LUSTRE_OPC_ANY,
2958 if (IS_ERR(op_data))
2959 return PTR_ERR(op_data);
2961 op_data->op_valid = valid;
2962 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2963 * capa for this inode. Because we only keep capas of dirs
2965 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2966 ll_finish_md_op_data(op_data);
2968 rc = ll_inode_revalidate_fini(inode, rc);
/* merge the returned attributes into the in-core inode */
2972 rc = ll_prep_inode(&inode, req, NULL, NULL);
2975 ptlrpc_req_finished(req);
/*
 * ll_inode_revalidate(): revalidate MD attributes for @ibits, then refresh
 * size/times.  For non-regular files the cached LVB times are authoritative;
 * regular files need a glimpse RPC for size unless an HSM restore is in
 * progress (the MDT already sent the correct size and holds the layout
 * lock, so a glimpse would just block until the restore finishes).
 */
2979 static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2981 struct inode *inode = dentry->d_inode;
2984 rc = __ll_inode_revalidate(dentry, ibits);
2988 /* if object isn't regular file, don't validate size */
2989 if (!S_ISREG(inode->i_mode)) {
2990 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2991 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2992 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2994 /* In case of restore, the MDT has the right size and has
2995 * already send it back without granting the layout lock,
2996 * inode is up-to-date so glimpse is useless.
2997 * Also to glimpse we need the layout, in case of a running
2998 * restore the MDT holds the layout lock so the glimpse will
2999 * block up to the end of restore (getattr will block)
3001 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3002 rc = ll_glimpse_size(inode);
/*
 * ll_getattr(): VFS ->getattr.  Revalidates UPDATE|LOOKUP ibits, then fills
 * *stat from the (now fresh) in-core inode.
 */
3007 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3009 struct inode *inode = de->d_inode;
3010 struct ll_sb_info *sbi = ll_i2sbi(inode);
3011 struct ll_inode_info *lli = ll_i2info(inode);
3014 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3015 MDS_INODELOCK_LOOKUP);
3016 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3021 stat->dev = inode->i_sb->s_dev;
/* 32-bit userland needs an ino that fits in 32 bits */
3022 if (ll_need_32bit_api(sbi))
3023 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3025 stat->ino = inode->i_ino;
3026 stat->mode = inode->i_mode;
3027 stat->nlink = inode->i_nlink;
3028 stat->uid = inode->i_uid;
3029 stat->gid = inode->i_gid;
3030 stat->rdev = inode->i_rdev;
3031 stat->atime = inode->i_atime;
3032 stat->mtime = inode->i_mtime;
3033 stat->ctime = inode->i_ctime;
3034 stat->blksize = 1 << inode->i_blkbits;
3036 stat->size = i_size_read(inode);
3037 stat->blocks = inode->i_blocks;
/*
 * ll_fiemap(): VFS ->fiemap.  Marshals the kernel's fiemap_extent_info into
 * Lustre's ll_user_fiemap layout, runs the mapping via ll_do_fiemap(), and
 * copies flags/extents back out.
 */
3042 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3043 __u64 start, __u64 len)
3047 struct ll_user_fiemap *fiemap;
3048 unsigned int extent_count = fieinfo->fi_extents_max;
3050 num_bytes = sizeof(*fiemap) + (extent_count *
3051 sizeof(struct ll_fiemap_extent));
3052 OBD_ALLOC_LARGE(fiemap, num_bytes);
3057 fiemap->fm_flags = fieinfo->fi_flags;
3058 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3059 fiemap->fm_start = start;
3060 fiemap->fm_length = len;
/* deliberately copies only the FIRST extent in: it carries continuation
 * state for the mapping request, not the whole extent array */
3061 if (extent_count > 0)
3062 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3063 sizeof(struct ll_fiemap_extent));
3065 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3067 fieinfo->fi_flags = fiemap->fm_flags;
3068 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* copy back all mapped extents */
3069 if (extent_count > 0)
3070 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3071 fiemap->fm_mapped_extents *
3072 sizeof(struct ll_fiemap_extent));
3074 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ll_get_acl(): return a referenced copy of the cached POSIX ACL for
 * @inode.  @type is unused here — presumably only access ACLs are cached;
 * TODO(review) confirm against callers.
 */
3078 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3080 struct ll_inode_info *lli = ll_i2info(inode);
3081 struct posix_acl *acl = NULL;
3083 spin_lock(&lli->lli_lock);
3084 /* VFS' acl_permission_check->check_acl will release the refcount */
3085 acl = posix_acl_dup(lli->lli_posix_acl);
3086 spin_unlock(&lli->lli_lock);
/*
 * ll_inode_permission(): VFS ->permission.  Revalidates the root inode
 * first (it is never revalidated via lookup), defers remote-client
 * permission checks to the MDS, and otherwise falls back to
 * generic_permission() on the cached mode bits.
 */
3092 int ll_inode_permission(struct inode *inode, int mask)
3096 #ifdef MAY_NOT_BLOCK
/* RCU-walk mode: we may need RPCs, so ask the VFS to retry in ref-walk */
3097 if (mask & MAY_NOT_BLOCK)
3101 /* as root inode are NOT getting validated in lookup operation,
3102 * need to do it before permission check. */
3104 if (is_root_inode(inode)) {
3105 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3106 MDS_INODELOCK_LOOKUP);
3111 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3112 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* remote clients must ask the MDS; local uid/gid are not authoritative */
3114 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3115 return lustre_check_remote_perm(inode, mask);
3117 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3118 rc = generic_permission(inode, mask);
3123 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations: no .flock/.lock, so the VFS falls back to
 * node-local POSIX/flock locking only. */
3124 struct file_operations ll_file_operations = {
3125 .read = new_sync_read,
3126 .read_iter = ll_file_read_iter,
3127 .write = new_sync_write,
3128 .write_iter = ll_file_write_iter,
3129 .unlocked_ioctl = ll_file_ioctl,
3130 .open = ll_file_open,
3131 .release = ll_file_release,
3132 .mmap = ll_file_mmap,
3133 .llseek = ll_file_seek,
3134 .splice_read = ll_file_splice_read,
/* File operations with cluster-coherent locking: .flock and .lock route
 * flock(2)/fcntl(2) requests through ll_file_flock() to the MDS. */
3139 struct file_operations ll_file_operations_flock = {
3140 .read = new_sync_read,
3141 .read_iter = ll_file_read_iter,
3142 .write = new_sync_write,
3143 .write_iter = ll_file_write_iter,
3144 .unlocked_ioctl = ll_file_ioctl,
3145 .open = ll_file_open,
3146 .release = ll_file_release,
3147 .mmap = ll_file_mmap,
3148 .llseek = ll_file_seek,
3149 .splice_read = ll_file_splice_read,
3152 .flock = ll_file_flock,
3153 .lock = ll_file_flock
3156 /* These are for -o noflock - to return ENOSYS on flock calls */
3157 struct file_operations ll_file_operations_noflock = {
3158 .read = new_sync_read,
3159 .read_iter = ll_file_read_iter,
3160 .write = new_sync_write,
3161 .write_iter = ll_file_write_iter,
3162 .unlocked_ioctl = ll_file_ioctl,
3163 .open = ll_file_open,
3164 .release = ll_file_release,
3165 .mmap = ll_file_mmap,
3166 .llseek = ll_file_seek,
3167 .splice_read = ll_file_splice_read,
/* both hooks reject locking outright (see ll_file_noflock above) */
3170 .flock = ll_file_noflock,
3171 .lock = ll_file_noflock
/* Inode operations for regular files. */
3174 struct inode_operations ll_file_inode_operations = {
3175 .setattr = ll_setattr,
3176 .getattr = ll_getattr,
3177 .permission = ll_inode_permission,
3178 .setxattr = ll_setxattr,
3179 .getxattr = ll_getxattr,
3180 .listxattr = ll_listxattr,
3181 .removexattr = ll_removexattr,
3182 .fiemap = ll_fiemap,
3183 .get_acl = ll_get_acl,
3186 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers, protected by
 * ioc_sem (readers: dispatch; writers: register/unregister).
 * NOTE(review): the initializer tail and the `struct llioc_data` head are
 * elided in this excerpt. */
3187 static struct llioc_ctl_data {
3188 struct rw_semaphore ioc_sem;
3189 struct list_head ioc_head;
3191 __RWSEM_INITIALIZER(llioc.ioc_sem),
3192 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl numbers it serves
 * (iocd_cmd is a trailing variable-length array of iocd_count entries). */
3197 struct list_head iocd_list;
3198 unsigned int iocd_size;
3199 llioc_callback_t iocd_cb;
3200 unsigned int iocd_count;
3201 unsigned int iocd_cmd[0];
3204 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3207 struct llioc_data *in_data = NULL;
3209 if (cb == NULL || cmd == NULL ||
3210 count > LLIOC_MAX_CMD || count < 0)
3213 size = sizeof(*in_data) + count * sizeof(unsigned int);
3214 in_data = kzalloc(size, GFP_NOFS);
3218 memset(in_data, 0, sizeof(*in_data));
3219 in_data->iocd_size = size;
3220 in_data->iocd_cb = cb;
3221 in_data->iocd_count = count;
3222 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3224 down_write(&llioc.ioc_sem);
3225 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3226 up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free the handler identified by the
 * cookie @magic returned from ll_iocontrol_register().  Warns if the cookie
 * is not found.  NOTE(review): the match condition inside the loop is
 * elided in this excerpt.
 */
3231 void ll_iocontrol_unregister(void *magic)
3233 struct llioc_data *tmp;
3238 down_write(&llioc.ioc_sem);
3239 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3241 unsigned int size = tmp->iocd_size;
3243 list_del(&tmp->iocd_list);
3244 up_write(&llioc.ioc_sem);
3246 OBD_FREE(tmp, size);
3250 up_write(&llioc.ioc_sem);
3252 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Export the dynamic-ioctl registration API to other Lustre modules. */
3255 EXPORT_SYMBOL(ll_iocontrol_register);
3256 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch @cmd to the first registered handler that
 * lists it.  The handler's rc is returned through *rcp (elided line);
 * iteration stops when a handler returns LLIOC_STOP.
 */
3258 static enum llioc_iter
3259 ll_iocontrol_call(struct inode *inode, struct file *file,
3260 unsigned int cmd, unsigned long arg, int *rcp)
3262 enum llioc_iter ret = LLIOC_CONT;
3263 struct llioc_data *data;
3264 int rc = -EINVAL, i;
3266 down_read(&llioc.ioc_sem);
3267 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3268 for (i = 0; i < data->iocd_count; i++) {
3269 if (cmd != data->iocd_cmd[i])
3272 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3276 if (ret == LLIOC_STOP)
3279 up_read(&llioc.ioc_sem);
/*
 * ll_layout_conf(): push a layout configuration change down to the cl_object
 * stack via cl_conf_set().  For OBJECT_CONF_SET the associated layout lock
 * is only made matchable after the new layout has been applied, so other
 * threads can never observe a stale layout through a matched lock.
 */
3286 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3288 struct ll_inode_info *lli = ll_i2info(inode);
3289 struct cl_env_nest nest;
/* no cl object yet means nothing to configure */
3293 if (lli->lli_clob == NULL)
3296 env = cl_env_nested_get(&nest);
3298 return PTR_ERR(env);
3300 result = cl_conf_set(env, lli->lli_clob, conf);
3301 cl_env_nested_put(&nest, env);
3303 if (conf->coc_opc == OBJECT_CONF_SET) {
3304 struct ldlm_lock *lock = conf->coc_lock;
3306 LASSERT(lock != NULL);
3307 LASSERT(ldlm_has_layout(lock));
3309 /* it can only be allowed to match after layout is
3310 * applied to inode otherwise false layout would be
3311 * seen. Applying layout should happen before dropping
3312 * the intent lock. */
3313 ldlm_lock_allow_match(lock);
3319 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * ll_layout_fetch(): populate @lock->l_lvb_data with the file's LOV layout.
 * If the lock already carries a ready LVB, nothing to do; otherwise fetch
 * the layout via a trusted.lov getxattr RPC and attach it to the lock.
 */
3320 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3323 struct ll_sb_info *sbi = ll_i2sbi(inode);
3324 struct obd_capa *oc;
3325 struct ptlrpc_request *req;
3326 struct mdt_body *body;
3332 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3333 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3334 lock->l_lvb_data, lock->l_lvb_len);
/* layout already attached and ready — nothing to fetch */
3336 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3339 /* if layout lock was granted right away, the layout is returned
3340 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3341 * blocked and then granted via completion ast, we have to fetch
3342 * layout here. Please note that we can't use the LVB buffer in
3343 * completion AST because it doesn't have a large enough buffer */
3344 oc = ll_mdscapa_get(inode);
3345 rc = ll_get_default_mdsize(sbi, &lmmsize);
3347 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3348 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3354 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3360 lmmsize = body->eadatasize;
3361 if (lmmsize == 0) /* empty layout */ {
3366 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
/* copy the layout into a buffer owned by the lock (the reply buffer is
 * freed with the request below) */
3372 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3373 if (lvbdata == NULL) {
3378 memcpy(lvbdata, lmm, lmmsize);
3379 lock_res_and_lock(lock);
3380 if (lock->l_lvb_data != NULL)
3381 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3383 lock->l_lvb_data = lvbdata;
3384 lock->l_lvb_len = lmmsize;
3385 unlock_res_and_lock(lock);
3388 ptlrpc_req_finished(req);
3393 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * ll_layout_lock_set(): given a held layout lock (@lockh/@mode), fetch the
 * layout if needed, unpack it, apply it to the cl_object, return the layout
 * generation in *gen, and drop the lock reference.  If the object is busy
 * (-EBUSY) it waits for in-flight IO via OBJECT_CONF_WAIT.
 * NOTE(review): excerpt is elided; comments describe only visible code.
 */
3396 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3397 struct inode *inode, __u32 *gen, bool reconf)
3399 struct ll_inode_info *lli = ll_i2info(inode);
3400 struct ll_sb_info *sbi = ll_i2sbi(inode);
3401 struct ldlm_lock *lock;
3402 struct lustre_md md = { NULL };
3403 struct cl_object_conf conf;
3406 bool wait_layout = false;
3408 LASSERT(lustre_handle_is_used(lockh));
3410 lock = ldlm_handle2lock(lockh);
3411 LASSERT(lock != NULL);
3412 LASSERT(ldlm_has_layout(lock));
3414 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3415 inode, PFID(&lli->lli_fid), reconf);
3417 /* in case this is a caching lock and reinstate with new inode */
3418 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3420 lock_res_and_lock(lock);
3421 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3422 unlock_res_and_lock(lock);
3423 /* checking lvb_ready is racy but this is okay. The worst case is
3424 * that multi processes may configure the file on the same time. */
3425 if (lvb_ready || !reconf) {
3428 /* layout_gen must be valid if layout lock is not
3429 * cancelled and stripe has already set */
3430 *gen = ll_layout_version_get(lli);
3436 rc = ll_layout_fetch(inode, lock);
3440 /* for layout lock, lmm is returned in lock's lvb.
3441 * lvb_data is immutable if the lock is held so it's safe to access it
3442 * without res lock. See the description in ldlm_lock_decref_internal()
3443 * for the condition to free lvb_data of layout lock */
3444 if (lock->l_lvb_data != NULL) {
3445 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3446 lock->l_lvb_data, lock->l_lvb_len);
/* md.lsm == NULL after unpack means a file with no striping */
3448 *gen = LL_LAYOUT_GEN_EMPTY;
3450 *gen = md.lsm->lsm_layout_gen;
3453 CERROR("%s: file "DFID" unpackmd error: %d\n",
3454 ll_get_fsname(inode->i_sb, NULL, 0),
3455 PFID(&lli->lli_fid), rc);
3461 /* set layout to file. Unlikely this will fail as old layout was
3462 * surely eliminated */
3463 memset(&conf, 0, sizeof(conf));
3464 conf.coc_opc = OBJECT_CONF_SET;
3465 conf.coc_inode = inode;
3466 conf.coc_lock = lock;
3467 conf.u.coc_md = &md;
3468 rc = ll_layout_conf(inode, &conf);
3471 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3473 /* refresh layout failed, need to wait */
3474 wait_layout = rc == -EBUSY;
3477 LDLM_LOCK_PUT(lock);
3478 ldlm_lock_decref(lockh, mode);
3480 /* wait for IO to complete if it's still being used. */
3482 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3483 ll_get_fsname(inode->i_sb, NULL, 0),
3484 inode, PFID(&lli->lli_fid));
3486 memset(&conf, 0, sizeof(conf));
3487 conf.coc_opc = OBJECT_CONF_WAIT;
3488 conf.coc_inode = inode;
3489 rc = ll_layout_conf(inode, &conf);
3493 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3494 PFID(&lli->lli_fid), rc);
3500 * This function checks if there exists a LAYOUT lock on the client side,
3501 * or enqueues it if it doesn't have one in cache.
3503 * This function will not hold layout lock so it may be revoked any time after
3504 * this function returns. Any operations depend on layout should be redone
3507 * This function should be called before lov_io_init() to get an uptodate
3508 * layout version, the caller should save the version number and after IO
3509 * is finished, this function should be called again to verify that layout
3510 * is not changed during IO time.
3512 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3514 struct ll_inode_info *lli = ll_i2info(inode);
3515 struct ll_sb_info *sbi = ll_i2sbi(inode);
3516 struct md_op_data *op_data;
3517 struct lookup_intent it;
3518 struct lustre_handle lockh;
3520 struct ldlm_enqueue_info einfo = {
3521 .ei_type = LDLM_IBITS,
3523 .ei_cb_bl = ll_md_blocking_ast,
3524 .ei_cb_cp = ldlm_completion_ast,
/* fast path: a valid generation means the layout is already current */
3528 *gen = ll_layout_version_get(lli);
3529 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3533 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3534 LASSERT(S_ISREG(inode->i_mode));
3536 /* take layout lock mutex to enqueue layout lock exclusively. */
3537 mutex_lock(&lli->lli_layout_mutex);
3540 /* mostly layout lock is caching on the local side, so try to match
3541 * it before grabbing layout lock mutex. */
3542 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3543 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3544 if (mode != 0) { /* hit cached lock */
3545 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3549 mutex_unlock(&lli->lli_layout_mutex);
3553 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3554 0, 0, LUSTRE_OPC_ANY, NULL);
3555 if (IS_ERR(op_data)) {
3556 mutex_unlock(&lli->lli_layout_mutex);
3557 return PTR_ERR(op_data);
3560 /* have to enqueue one */
3561 memset(&it, 0, sizeof(it));
3562 it.it_op = IT_LAYOUT;
3563 lockh.cookie = 0ULL;
3565 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3566 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3567 PFID(&lli->lli_fid));
3569 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* the intent's reply request is not needed once the lock is obtained */
3571 if (it.d.lustre.it_data != NULL)
3572 ptlrpc_req_finished(it.d.lustre.it_data);
3573 it.d.lustre.it_data = NULL;
3575 ll_finish_md_op_data(op_data);
/* take over the lock mode from the intent before releasing it */
3577 mode = it.d.lustre.it_lock_mode;
3578 it.d.lustre.it_lock_mode = 0;
3579 ll_intent_drop_lock(&it);
3582 /* set lock data in case this is a new lock */
3583 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3584 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3588 mutex_unlock(&lli->lli_layout_mutex);
3594 * This function send a restore request to the MDT
3596 int ll_layout_restore(struct inode *inode)
3598 struct hsm_user_request *hur;
3601 len = sizeof(struct hsm_user_request) +
3602 sizeof(struct hsm_user_item);
3603 hur = kzalloc(len, GFP_NOFS);
3607 hur->hur_request.hr_action = HUA_RESTORE;
3608 hur->hur_request.hr_archive_id = 0;
3609 hur->hur_request.hr_flags = 0;
3610 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3611 sizeof(hur->hur_user_item[0].hui_fid));
3612 hur->hur_user_item[0].hui_extent.length = -1;
3613 hur->hur_request.hr_itemcount = 1;
3614 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,