drivers/staging/lustre/lustre/llite/llite_lib.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2015, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/llite_lib.c
  33  *
  34  * Lustre Light Super operations
  35  */
  36
  37 #define DEBUG_SUBSYSTEM S_LLITE
  38
  39 #include <linux/module.h>
  40 #include <linux/statfs.h>
  41 #include <linux/types.h>
  42 #include <linux/mm.h>
  43
  44 #include "../include/lustre_lite.h"
  45 #include "../include/lustre_ha.h"
  46 #include "../include/lustre_dlm.h"
  47 #include "../include/lprocfs_status.h"
  48 #include "../include/lustre_disk.h"
  49 #include "../include/lustre_param.h"
  50 #include "../include/lustre_log.h"
  51 #include "../include/cl_object.h"
  52 #include "../include/obd_cksum.h"
  53 #include "llite_internal.h"
  54
  55 struct kmem_cache *ll_file_data_slab;
  56 struct dentry *llite_root;
  57 struct kset *llite_kset;
  58
  59 #ifndef log2
  60 #define log2(n) ffz(~(n))
  61 #endif
  62
  63 static struct ll_sb_info *ll_init_sbi(struct super_block *sb)
  64 {
  65         struct ll_sb_info *sbi = NULL;
  66         unsigned long pages;
  67         unsigned long lru_page_max;
  68         struct sysinfo si;
  69         class_uuid_t uuid;
  70         int i;
  71
  72         sbi = kzalloc(sizeof(*sbi), GFP_NOFS);
  73         if (!sbi)
  74                 return NULL;
  75
  76         spin_lock_init(&sbi->ll_lock);
  77         mutex_init(&sbi->ll_lco.lco_lock);
  78         spin_lock_init(&sbi->ll_pp_extent_lock);
  79         spin_lock_init(&sbi->ll_process_lock);
  80         sbi->ll_rw_stats_on = 0;
  81
  82         si_meminfo(&si);
  83         pages = si.totalram - si.totalhigh;
  84         lru_page_max = pages / 2;
  85
  86         sbi->ll_cache = cl_cache_init(lru_page_max);
  87         if (!sbi->ll_cache) {
  88                 kfree(sbi);
  89                 return NULL;
  90         }
  91
  92         sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32,
  93                                            SBI_DEFAULT_READAHEAD_MAX);
  94         sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
  95         sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
  96                                            SBI_DEFAULT_READAHEAD_WHOLE_MAX;
  97
  98         ll_generate_random_uuid(uuid);
  99         class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
 100         CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid);
 101
 102         sbi->ll_flags |= LL_SBI_VERBOSE;
 103         sbi->ll_flags |= LL_SBI_CHECKSUM;
 104
 105         sbi->ll_flags |= LL_SBI_LRU_RESIZE;
 106
 107         for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
 108                 spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
 109                                pp_r_hist.oh_lock);
 110                 spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
 111                                pp_w_hist.oh_lock);
 112         }
 113
 114         /* metadata statahead is enabled by default */
 115         sbi->ll_sa_max = LL_SA_RPC_DEF;
 116         atomic_set(&sbi->ll_sa_total, 0);
 117         atomic_set(&sbi->ll_sa_wrong, 0);
 118         atomic_set(&sbi->ll_agl_total, 0);
 119         sbi->ll_flags |= LL_SBI_AGL_ENABLED;
 120
 121         sbi->ll_sb = sb;
 122
 123         return sbi;
 124 }
 125
 126 static void ll_free_sbi(struct super_block *sb)
 127 {
 128         struct ll_sb_info *sbi = ll_s2sbi(sb);
 129
 130         if (sbi->ll_cache) {
 131                 cl_cache_decref(sbi->ll_cache);
 132                 sbi->ll_cache = NULL;
 133         }
 134
 135         kfree(sbi);
 136 }
 137
 138 static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
 139                                     struct vfsmount *mnt)
 140 {
 141         struct inode *root = NULL;
 142         struct ll_sb_info *sbi = ll_s2sbi(sb);
 143         struct obd_device *obd;
 144         struct obd_statfs *osfs = NULL;
 145         struct ptlrpc_request *request = NULL;
 146         struct obd_connect_data *data = NULL;
 147         struct obd_uuid *uuid;
 148         struct md_op_data *op_data;
 149         struct lustre_md lmd;
 150         u64 valid;
 151         int size, err, checksum;
 152
 153         obd = class_name2obd(md);
 154         if (!obd) {
 155                 CERROR("MD %s: not setup or attached\n", md);
 156                 return -EINVAL;
 157         }
 158
 159         data = kzalloc(sizeof(*data), GFP_NOFS);
 160         if (!data)
 161                 return -ENOMEM;
 162
 163         osfs = kzalloc(sizeof(*osfs), GFP_NOFS);
 164         if (!osfs) {
 165                 kfree(data);
 166                 return -ENOMEM;
 167         }
 168
 169         /* indicate the features supported by this client */
 170         data->ocd_connect_flags = OBD_CONNECT_IBITS    | OBD_CONNECT_NODEVOH  |
 171                                   OBD_CONNECT_ATTRFID  |
 172                                   OBD_CONNECT_VERSION  | OBD_CONNECT_BRW_SIZE |
 173                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID     |
 174                                   OBD_CONNECT_AT       | OBD_CONNECT_LOV_V3   |
 175                                   OBD_CONNECT_VBR       | OBD_CONNECT_FULL20  |
 176                                   OBD_CONNECT_64BITHASH |
 177                                   OBD_CONNECT_EINPROGRESS |
 178                                   OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
 179                                   OBD_CONNECT_LAYOUTLOCK |
 180                                   OBD_CONNECT_PINGLESS |
 181                                   OBD_CONNECT_MAX_EASIZE |
 182                                   OBD_CONNECT_FLOCK_DEAD |
 183                                   OBD_CONNECT_DISP_STRIPE;
 184
 185         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
 186                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
 187
 188         if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
 189                 data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
 190 #ifdef CONFIG_FS_POSIX_ACL
 191         data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK;
 192 #endif
 193
 194         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT))
 195                 /* flag mdc connection as lightweight, only used for test
 196                  * purpose, use with care
 197                  */
 198                 data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT;
 199
 200         data->ocd_ibits_known = MDS_INODELOCK_FULL;
 201         data->ocd_version = LUSTRE_VERSION_CODE;
 202
 203         if (sb->s_flags & MS_RDONLY)
 204                 data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
 205         if (sbi->ll_flags & LL_SBI_USER_XATTR)
 206                 data->ocd_connect_flags |= OBD_CONNECT_XATTR;
 207
 208         if (sbi->ll_flags & LL_SBI_FLOCK)
 209                 sbi->ll_fop = &ll_file_operations_flock;
 210         else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
 211                 sbi->ll_fop = &ll_file_operations;
 212         else
 213                 sbi->ll_fop = &ll_file_operations_noflock;
 214
 215         /* real client */
 216         data->ocd_connect_flags |= OBD_CONNECT_REAL;
 217
 218         data->ocd_brw_size = MD_MAX_BRW_SIZE;
 219
 220         err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid,
 221                           data, NULL);
 222         if (err == -EBUSY) {
 223                 LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n",
 224                                    md);
 225                 goto out;
 226         } else if (err) {
 227                 CERROR("cannot connect to %s: rc = %d\n", md, err);
 228                 goto out;
 229         }
 230
 231         sbi->ll_md_exp->exp_connect_data = *data;
 232
 233         err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp,
 234                            LUSTRE_SEQ_METADATA);
 235         if (err) {
 236                 CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n",
 237                        sbi->ll_md_exp->exp_obd->obd_name, err);
 238                 goto out_md;
 239         }
 240
 241         /* For mount, we only need fs info from MDT0, and also in DNE, it
 242          * can make sure the client can be mounted as long as MDT0 is
 243          * available
 244          */
 245         err = obd_statfs(NULL, sbi->ll_md_exp, osfs,
 246                          cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
 247                          OBD_STATFS_FOR_MDT0);
 248         if (err)
 249                 goto out_md_fid;
 250
 251         /* This needs to be after statfs to ensure connect has finished.
 252          * Note that "data" does NOT contain the valid connect reply.
 253          * If connecting to a 1.8 server there will be no LMV device, so
 254          * we can access the MDC export directly and exp_connect_flags will
 255          * be non-zero, but if accessing an upgraded 2.1 server it will
 256          * have the correct flags filled in.
 257          * XXX: fill in the LMV exp_connect_flags from MDC(s).
 258          */
 259         valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD;
 260         if (exp_connect_flags(sbi->ll_md_exp) != 0 &&
 261             valid != CLIENT_CONNECT_MDT_REQD) {
 262                 char *buf;
 263
 264                 buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
 265                 if (!buf) {
 266                         err = -ENOMEM;
 267                         goto out_md_fid;
 268                 }
 269                 obd_connect_flags2str(buf, PAGE_SIZE,
 270                                       valid ^ CLIENT_CONNECT_MDT_REQD, ",");
 271                 LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n",
 272                                    sbi->ll_md_exp->exp_obd->obd_name, buf);
 273                 kfree(buf);
 274                 err = -EPROTO;
 275                 goto out_md_fid;
 276         }
 277
 278         size = sizeof(*data);
 279         err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA),
 280                            KEY_CONN_DATA,  &size, data, NULL);
 281         if (err) {
 282                 CERROR("%s: Get connect data failed: rc = %d\n",
 283                        sbi->ll_md_exp->exp_obd->obd_name, err);
 284                 goto out_md_fid;
 285         }
 286
 287         LASSERT(osfs->os_bsize);
 288         sb->s_blocksize = osfs->os_bsize;
 289         sb->s_blocksize_bits = log2(osfs->os_bsize);
 290         sb->s_magic = LL_SUPER_MAGIC;
 291         sb->s_maxbytes = MAX_LFS_FILESIZE;
 292         sbi->ll_namelen = osfs->os_namelen;
 293
 294         if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
 295             !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
 296                 LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n");
 297                 sbi->ll_flags &= ~LL_SBI_USER_XATTR;
 298         }
 299
 300         if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
 301                 sb->s_flags |= MS_POSIXACL;
 302                 sbi->ll_flags |= LL_SBI_ACL;
 303         } else {
 304                 LCONSOLE_INFO("client wants to enable acl, but mdt not!\n");
 305                 sb->s_flags &= ~MS_POSIXACL;
 306                 sbi->ll_flags &= ~LL_SBI_ACL;
 307         }
 308
 309         if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH)
 310                 sbi->ll_flags |= LL_SBI_64BIT_HASH;
 311
 312         if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
 313                 sbi->ll_md_brw_size = data->ocd_brw_size;
 314         else
 315                 sbi->ll_md_brw_size = PAGE_SIZE;
 316
 317         if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK)
 318                 sbi->ll_flags |= LL_SBI_LAYOUT_LOCK;
 319
 320         if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) {
 321                 if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) {
 322                         LCONSOLE_INFO(
 323                                 "%s: disabling xattr cache due to unknown maximum xattr size.\n",
 324                                 dt);
 325                 } else {
 326                         sbi->ll_flags |= LL_SBI_XATTR_CACHE;
 327                         sbi->ll_xattr_cache_enabled = 1;
 328                 }
 329         }
 330
 331         obd = class_name2obd(dt);
 332         if (!obd) {
 333                 CERROR("DT %s: not setup or attached\n", dt);
 334                 err = -ENODEV;
 335                 goto out_md_fid;
 336         }
 337
 338         data->ocd_connect_flags = OBD_CONNECT_GRANT     | OBD_CONNECT_VERSION  |
 339                                   OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE |
 340                                   OBD_CONNECT_CANCELSET | OBD_CONNECT_FID      |
 341                                   OBD_CONNECT_SRVLOCK   | OBD_CONNECT_TRUNCLOCK|
 342                                   OBD_CONNECT_AT        | OBD_CONNECT_OSS_CAPA |
 343                                   OBD_CONNECT_VBR       | OBD_CONNECT_FULL20   |
 344                                   OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES |
 345                                   OBD_CONNECT_EINPROGRESS |
 346                                   OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE |
 347                                   OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS;
 348
 349         if (sbi->ll_flags & LL_SBI_SOM_PREVIEW)
 350                 data->ocd_connect_flags |= OBD_CONNECT_SOM;
 351
 352         if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) {
 353                 /* OBD_CONNECT_CKSUM should always be set, even if checksums are
 354                  * disabled by default, because it can still be enabled on the
 355                  * fly via /sys. As a consequence, we still need to come to an
 356                  * agreement on the supported algorithms at connect time
 357                  */
 358                 data->ocd_connect_flags |= OBD_CONNECT_CKSUM;
 359
 360                 if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY))
 361                         data->ocd_cksum_types = OBD_CKSUM_ADLER;
 362                 else
 363                         data->ocd_cksum_types = cksum_types_supported_client();
 364         }
 365
 366         data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE;
 367
 368         CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n",
 369                data->ocd_connect_flags,
 370                data->ocd_version, data->ocd_grant);
 371
 372         obd->obd_upcall.onu_owner = &sbi->ll_lco;
 373         obd->obd_upcall.onu_upcall = cl_ocd_update;
 374
 375         data->ocd_brw_size = DT_MAX_BRW_SIZE;
 376
 377         err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data,
 378                           NULL);
 379         if (err == -EBUSY) {
 380                 LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing recovery, of which this client is not a part.  Please wait for recovery to complete, abort, or time out.\n",
 381                                    dt);
 382                 goto out_md;
 383         } else if (err) {
 384                 CERROR("%s: Cannot connect to %s: rc = %d\n",
 385                        sbi->ll_dt_exp->exp_obd->obd_name, dt, err);
 386                 goto out_md;
 387         }
 388
 389         sbi->ll_dt_exp->exp_connect_data = *data;
 390
 391         err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp,
 392                            LUSTRE_SEQ_METADATA);
 393         if (err) {
 394                 CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n",
 395                        sbi->ll_dt_exp->exp_obd->obd_name, err);
 396                 goto out_dt;
 397         }
 398
 399         mutex_lock(&sbi->ll_lco.lco_lock);
 400         sbi->ll_lco.lco_flags = data->ocd_connect_flags;
 401         sbi->ll_lco.lco_md_exp = sbi->ll_md_exp;
 402         sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp;
 403         mutex_unlock(&sbi->ll_lco.lco_lock);
 404
 405         fid_zero(&sbi->ll_root_fid);
 406         err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid);
 407         if (err) {
 408                 CERROR("cannot mds_connect: rc = %d\n", err);
 409                 goto out_lock_cn_cb;
 410         }
 411         if (!fid_is_sane(&sbi->ll_root_fid)) {
 412                 CERROR("%s: Invalid root fid "DFID" during mount\n",
 413                        sbi->ll_md_exp->exp_obd->obd_name,
 414                        PFID(&sbi->ll_root_fid));
 415                 err = -EINVAL;
 416                 goto out_lock_cn_cb;
 417         }
 418         CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid));
 419
 420         sb->s_op = &lustre_super_operations;
 421 #if THREAD_SIZE >= 8192 /*b=17630*/
 422         sb->s_export_op = &lustre_export_operations;
 423 #endif
 424
 425         /* make root inode
 426          * XXX: move this to after cbd setup?
 427          */
 428         valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE;
 429         if (sbi->ll_flags & LL_SBI_ACL)
 430                 valid |= OBD_MD_FLACL;
 431
 432         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
 433         if (!op_data) {
 434                 err = -ENOMEM;
 435                 goto out_lock_cn_cb;
 436         }
 437
 438         op_data->op_fid1 = sbi->ll_root_fid;
 439         op_data->op_mode = 0;
 440         op_data->op_valid = valid;
 441
 442         err = md_getattr(sbi->ll_md_exp, op_data, &request);
 443         kfree(op_data);
 444         if (err) {
 445                 CERROR("%s: md_getattr failed for root: rc = %d\n",
 446                        sbi->ll_md_exp->exp_obd->obd_name, err);
 447                 goto out_lock_cn_cb;
 448         }
 449
 450         err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
 451                                sbi->ll_md_exp, &lmd);
 452         if (err) {
 453                 CERROR("failed to understand root inode md: rc = %d\n", err);
 454                 ptlrpc_req_finished(request);
 455                 goto out_lock_cn_cb;
 456         }
 457
 458         LASSERT(fid_is_sane(&sbi->ll_root_fid));
 459         root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid,
 460                                             sbi->ll_flags & LL_SBI_32BIT_API),
 461                        &lmd);
 462         md_free_lustre_md(sbi->ll_md_exp, &lmd);
 463         ptlrpc_req_finished(request);
 464
 465         if (!(root)) {
 466                 if (lmd.lsm)
 467                         obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
 468 #ifdef CONFIG_FS_POSIX_ACL
 469                 if (lmd.posix_acl) {
 470                         posix_acl_release(lmd.posix_acl);
 471                         lmd.posix_acl = NULL;
 472                 }
 473 #endif
 474                 err = -EBADF;
 475                 CERROR("lustre_lite: bad iget4 for root\n");
 476                 goto out_root;
 477         }
 478
 479         err = ll_close_thread_start(&sbi->ll_lcq);
 480         if (err) {
 481                 CERROR("cannot start close thread: rc %d\n", err);
 482                 goto out_root;
 483         }
 484
 485         checksum = sbi->ll_flags & LL_SBI_CHECKSUM;
 486         err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
 487                                  KEY_CHECKSUM, sizeof(checksum), &checksum,
 488                                  NULL);
 489         cl_sb_init(sb);
 490
 491         err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET),
 492                                  KEY_CACHE_SET, sizeof(*sbi->ll_cache),
 493                                  sbi->ll_cache, NULL);
 494
 495         sb->s_root = d_make_root(root);
 496         if (!sb->s_root) {
 497                 CERROR("%s: can't make root dentry\n",
 498                        ll_get_fsname(sb, NULL, 0));
 499                 err = -ENOMEM;
 500                 goto out_lock_cn_cb;
 501         }
 502
 503         sbi->ll_sdev_orig = sb->s_dev;
 504
 505         /* We set sb->s_dev equal on all lustre clients in order to support
 506          * NFS export clustering.  NFSD requires that the FSID be the same
 507          * on all clients.
 508          */
 509         /* s_dev is also used in lt_compare() to compare two fs, but that is
 510          * only a node-local comparison.
 511          */
 512         uuid = obd_get_uuid(sbi->ll_md_exp);
 513         if (uuid) {
 514                 sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
 515                 get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid);
 516         }
 517
 518         kfree(data);
 519         kfree(osfs);
 520
 521         if (llite_root) {
 522                 err = ldebugfs_register_mountpoint(llite_root, sb, dt, md);
 523                 if (err < 0) {
 524                         CERROR("%s: could not register mount in debugfs: "
 525                                "rc = %d\n", ll_get_fsname(sb, NULL, 0), err);
 526                         err = 0;
 527                 }
 528         }
 529
 530         return err;
 531 out_root:
 532         iput(root);
 533 out_lock_cn_cb:
 534         obd_fid_fini(sbi->ll_dt_exp->exp_obd);
 535 out_dt:
 536         obd_disconnect(sbi->ll_dt_exp);
 537         sbi->ll_dt_exp = NULL;
 538 out_md_fid:
 539         obd_fid_fini(sbi->ll_md_exp->exp_obd);
 540 out_md:
 541         obd_disconnect(sbi->ll_md_exp);
 542         sbi->ll_md_exp = NULL;
 543 out:
 544         kfree(data);
 545         kfree(osfs);
 546         return err;
 547 }
 548
 549 int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
 550 {
 551         int size, rc;
 552
 553         *lmmsize = obd_size_diskmd(sbi->ll_dt_exp, NULL);
 554         size = sizeof(int);
 555         rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE),
 556                           KEY_MAX_EASIZE, &size, lmmsize, NULL);
 557         if (rc)
 558                 CERROR("Get max mdsize error rc %d\n", rc);
 559
 560         return rc;
 561 }
 562
 563 int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize)
 564 {
 565         int size, rc;
 566
 567         size = sizeof(int);
 568         rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE),
 569                           KEY_DEFAULT_EASIZE, &size, lmmsize, NULL);
 570         if (rc)
 571                 CERROR("Get default mdsize error rc %d\n", rc);
 572
 573         return rc;
 574 }
 575
 576 static void client_common_put_super(struct super_block *sb)
 577 {
 578         struct ll_sb_info *sbi = ll_s2sbi(sb);
 579
 580         ll_close_thread_shutdown(sbi->ll_lcq);
 581
 582         cl_sb_fini(sb);
 583
 584         obd_fid_fini(sbi->ll_dt_exp->exp_obd);
 585         obd_disconnect(sbi->ll_dt_exp);
 586         sbi->ll_dt_exp = NULL;
 587
 588         ldebugfs_unregister_mountpoint(sbi);
 589
 590         obd_fid_fini(sbi->ll_md_exp->exp_obd);
 591         obd_disconnect(sbi->ll_md_exp);
 592         sbi->ll_md_exp = NULL;
 593 }
 594
 595 void ll_kill_super(struct super_block *sb)
 596 {
 597         struct ll_sb_info *sbi;
 598
 599         /* not init sb ?*/
 600         if (!(sb->s_flags & MS_ACTIVE))
 601                 return;
 602
 603         sbi = ll_s2sbi(sb);
 604         /* we need to restore s_dev from changed for clustered NFS before
 605          * put_super because new kernels have cached s_dev and change sb->s_dev
 606          * in put_super not affected real removing devices
 607          */
 608         if (sbi) {
 609                 sb->s_dev = sbi->ll_sdev_orig;
 610                 sbi->ll_umounting = 1;
 611         }
 612 }
 613
 614 static inline int ll_set_opt(const char *opt, char *data, int fl)
 615 {
 616         if (strncmp(opt, data, strlen(opt)) != 0)
 617                 return 0;
 618         else
 619                 return fl;
 620 }
 621
 622 /* non-client-specific mount options are parsed in lmd_parse */
 623 static int ll_options(char *options, int *flags)
 624 {
 625         int tmp;
 626         char *s1 = options, *s2;
 627
 628         if (!options)
 629                 return 0;
 630
 631         CDEBUG(D_CONFIG, "Parsing opts %s\n", options);
 632
 633         while (*s1) {
 634                 CDEBUG(D_SUPER, "next opt=%s\n", s1);
 635                 tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK);
 636                 if (tmp) {
 637                         *flags |= tmp;
 638                         goto next;
 639                 }
 640                 tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK);
 641                 if (tmp) {
 642                         *flags |= tmp;
 643                         goto next;
 644                 }
 645                 tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK);
 646                 if (tmp) {
 647                         *flags |= tmp;
 648                         goto next;
 649                 }
 650                 tmp = ll_set_opt("noflock", s1, LL_SBI_FLOCK|LL_SBI_LOCALFLOCK);
 651                 if (tmp) {
 652                         *flags &= ~tmp;
 653                         goto next;
 654                 }
 655                 tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR);
 656                 if (tmp) {
 657                         *flags |= tmp;
 658                         goto next;
 659                 }
 660                 tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR);
 661                 if (tmp) {
 662                         *flags &= ~tmp;
 663                         goto next;
 664                 }
 665                 tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH);
 666                 if (tmp) {
 667                         *flags |= tmp;
 668                         goto next;
 669                 }
 670                 tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH);
 671                 if (tmp) {
 672                         *flags &= ~tmp;
 673                         goto next;
 674                 }
 675
 676                 tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM);
 677                 if (tmp) {
 678                         *flags |= tmp;
 679                         goto next;
 680                 }
 681                 tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM);
 682                 if (tmp) {
 683                         *flags &= ~tmp;
 684                         goto next;
 685                 }
 686                 tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE);
 687                 if (tmp) {
 688                         *flags |= tmp;
 689                         goto next;
 690                 }
 691                 tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE);
 692                 if (tmp) {
 693                         *flags &= ~tmp;
 694                         goto next;
 695                 }
 696                 tmp = ll_set_opt("lazystatfs", s1, LL_SBI_LAZYSTATFS);
 697                 if (tmp) {
 698                         *flags |= tmp;
 699                         goto next;
 700                 }
 701                 tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS);
 702                 if (tmp) {
 703                         *flags &= ~tmp;
 704                         goto next;
 705                 }
 706                 tmp = ll_set_opt("som_preview", s1, LL_SBI_SOM_PREVIEW);
 707                 if (tmp) {
 708                         *flags |= tmp;
 709                         goto next;
 710                 }
 711                 tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API);
 712                 if (tmp) {
 713                         *flags |= tmp;
 714                         goto next;
 715                 }
 716                 tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE);
 717                 if (tmp) {
 718                         *flags |= tmp;
 719                         goto next;
 720                 }
 721                 tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE);
 722                 if (tmp) {
 723                         *flags &= ~tmp;
 724                         goto next;
 725                 }
 726                 LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n",
 727                                    s1);
 728                 return -EINVAL;
 729
 730 next:
 731                 /* Find next opt */
 732                 s2 = strchr(s1, ',');
 733                 if (!s2)
 734                         break;
 735                 s1 = s2 + 1;
 736         }
 737         return 0;
 738 }
 739
 740 void ll_lli_init(struct ll_inode_info *lli)
 741 {
 742         lli->lli_inode_magic = LLI_INODE_MAGIC;
 743         lli->lli_flags = 0;
 744         lli->lli_ioepoch = 0;
 745         lli->lli_maxbytes = MAX_LFS_FILESIZE;
 746         spin_lock_init(&lli->lli_lock);
 747         lli->lli_posix_acl = NULL;
 748         /* Do not set lli_fid, it has been initialized already. */
 749         fid_zero(&lli->lli_pfid);
 750         INIT_LIST_HEAD(&lli->lli_close_list);
 751         lli->lli_pending_och = NULL;
 752         lli->lli_mds_read_och = NULL;
 753         lli->lli_mds_write_och = NULL;
 754         lli->lli_mds_exec_och = NULL;
 755         lli->lli_open_fd_read_count = 0;
 756         lli->lli_open_fd_write_count = 0;
 757         lli->lli_open_fd_exec_count = 0;
 758         mutex_init(&lli->lli_och_mutex);
 759         spin_lock_init(&lli->lli_agl_lock);
 760         lli->lli_has_smd = false;
 761         spin_lock_init(&lli->lli_layout_lock);
 762         ll_layout_version_set(lli, LL_LAYOUT_GEN_NONE);
 763         lli->lli_clob = NULL;
 764
 765         init_rwsem(&lli->lli_xattrs_list_rwsem);
 766         mutex_init(&lli->lli_xattrs_enq_lock);
 767
 768         LASSERT(lli->lli_vfs_inode.i_mode != 0);
 769         if (S_ISDIR(lli->lli_vfs_inode.i_mode)) {
 770                 mutex_init(&lli->lli_readdir_mutex);
 771                 lli->lli_opendir_key = NULL;
 772                 lli->lli_sai = NULL;
 773                 spin_lock_init(&lli->lli_sa_lock);
 774                 lli->lli_opendir_pid = 0;
 775         } else {
 776                 mutex_init(&lli->lli_size_mutex);
 777                 lli->lli_symlink_name = NULL;
 778                 init_rwsem(&lli->lli_trunc_sem);
 779                 mutex_init(&lli->lli_write_mutex);
 780                 init_rwsem(&lli->lli_glimpse_sem);
 781                 lli->lli_glimpse_time = 0;
 782                 INIT_LIST_HEAD(&lli->lli_agl_list);
 783                 lli->lli_agl_index = 0;
 784                 lli->lli_async_rc = 0;
 785         }
 786         mutex_init(&lli->lli_layout_mutex);
 787 }
 788
 789 static inline int ll_bdi_register(struct backing_dev_info *bdi)
 790 {
 791         static atomic_t ll_bdi_num = ATOMIC_INIT(0);
 792
 793         bdi->name = "lustre";
 794         return bdi_register(bdi, NULL, "lustre-%d",
 795                             atomic_inc_return(&ll_bdi_num));
 796 }
 797
 798 int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 799 {
 800         struct lustre_profile *lprof = NULL;
 801         struct lustre_sb_info *lsi = s2lsi(sb);
 802         struct ll_sb_info *sbi;
 803         char  *dt = NULL, *md = NULL;
 804         char  *profilenm = get_profile_name(sb);
 805         struct config_llog_instance *cfg;
 806         int    err;
 807
 808         CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
 809
 810         cfg = kzalloc(sizeof(*cfg), GFP_NOFS);
 811         if (!cfg)
 812                 return -ENOMEM;
 813
 814         try_module_get(THIS_MODULE);
 815
 816         /* client additional sb info */
 817         sbi = ll_init_sbi(sb);
 818         lsi->lsi_llsbi = sbi;
 819         if (!sbi) {
 820                 module_put(THIS_MODULE);
 821                 kfree(cfg);
 822                 return -ENOMEM;
 823         }
 824
 825         err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags);
 826         if (err)
 827                 goto out_free;
 828
 829         err = bdi_init(&lsi->lsi_bdi);
 830         if (err)
 831                 goto out_free;
 832         lsi->lsi_flags |= LSI_BDI_INITIALIZED;
 833         lsi->lsi_bdi.capabilities = 0;
 834         err = ll_bdi_register(&lsi->lsi_bdi);
 835         if (err)
 836                 goto out_free;
 837
 838         sb->s_bdi = &lsi->lsi_bdi;
 839         /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
 840         sb->s_d_op = &ll_d_ops;
 841
 842         /* Generate a string unique to this super, in case some joker tries
 843          * to mount the same fs at two mount points.
 844          * Use the address of the super itself.
 845          */
 846         cfg->cfg_instance = sb;
 847         cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid;
 848         cfg->cfg_callback = class_config_llog_handler;
 849         /* set up client obds */
 850         err = lustre_process_log(sb, profilenm, cfg);
 851         if (err < 0)
 852                 goto out_free;
 853
 854         /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */
 855         lprof = class_get_profile(profilenm);
 856         if (!lprof) {
 857                 LCONSOLE_ERROR_MSG(0x156, "The client profile '%s' could not be read from the MGS.  Does that filesystem exist?\n",
 858                                    profilenm);
 859                 err = -EINVAL;
 860                 goto out_free;
 861         }
 862         CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm,
 863                lprof->lp_md, lprof->lp_dt);
 864
 865         dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance);
 866         if (!dt) {
 867                 err = -ENOMEM;
 868                 goto out_free;
 869         }
 870
 871         md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance);
 872         if (!md) {
 873                 err = -ENOMEM;
 874                 goto out_free;
 875         }
 876
 877         /* connections, registrations, sb setup */
 878         err = client_common_fill_super(sb, md, dt, mnt);
 879
 880 out_free:
 881         kfree(md);
 882         kfree(dt);
 883         if (err)
 884                 ll_put_super(sb);
 885         else if (sbi->ll_flags & LL_SBI_VERBOSE)
 886                 LCONSOLE_WARN("Mounted %s\n", profilenm);
 887
 888         kfree(cfg);
 889         return err;
 890 } /* ll_fill_super */
 891
 892 void ll_put_super(struct super_block *sb)
 893 {
 894         struct config_llog_instance cfg, params_cfg;
 895         struct obd_device *obd;
 896         struct lustre_sb_info *lsi = s2lsi(sb);
 897         struct ll_sb_info *sbi = ll_s2sbi(sb);
 898         char *profilenm = get_profile_name(sb);
 899         int ccc_count, next, force = 1, rc = 0;
 900
 901         CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm);
 902
 903         cfg.cfg_instance = sb;
 904         lustre_end_log(sb, profilenm, &cfg);
 905
 906         params_cfg.cfg_instance = sb;
 907         lustre_end_log(sb, PARAMS_FILENAME, &params_cfg);
 908
 909         if (sbi->ll_md_exp) {
 910                 obd = class_exp2obd(sbi->ll_md_exp);
 911                 if (obd)
 912                         force = obd->obd_force;
 913         }
 914
 915         /* Wait for unstable pages to be committed to stable storage */
 916         if (!force) {
 917                 struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
 918
 919                 rc = l_wait_event(sbi->ll_cache->ccc_unstable_waitq,
 920                                   !atomic_read(&sbi->ll_cache->ccc_unstable_nr),
 921                                   &lwi);
 922         }
 923
 924         ccc_count = atomic_read(&sbi->ll_cache->ccc_unstable_nr);
 925         if (!force && rc != -EINTR)
 926                 LASSERTF(!ccc_count, "count: %i\n", ccc_count);
 927
 928         /* We need to set force before the lov_disconnect in
 929          * lustre_common_put_super, since l_d cleans up osc's as well.
 930          */
 931         if (force) {
 932                 next = 0;
 933                 while ((obd = class_devices_in_group(&sbi->ll_sb_uuid,
 934                                                      &next)) != NULL) {
 935                         obd->obd_force = force;
 936                 }
 937         }
 938
 939         if (sbi->ll_lcq) {
 940                 /* Only if client_common_fill_super succeeded */
 941                 client_common_put_super(sb);
 942         }
 943
 944         next = 0;
 945         while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)))
 946                 class_manual_cleanup(obd);
 947
 948         if (sbi->ll_flags & LL_SBI_VERBOSE)
 949                 LCONSOLE_WARN("Unmounted %s\n", profilenm ? profilenm : "");
 950
 951         if (profilenm)
 952                 class_del_profile(profilenm);
 953
 954         if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
 955                 bdi_destroy(&lsi->lsi_bdi);
 956                 lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
 957         }
 958
 959         ll_free_sbi(sb);
 960         lsi->lsi_llsbi = NULL;
 961
 962         lustre_common_put_super(sb);
 963
 964         cl_env_cache_purge(~0);
 965
 966         module_put(THIS_MODULE);
 967 } /* client_put_super */
 968
 969 struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock)
 970 {
 971         struct inode *inode = NULL;
 972
 973         /* NOTE: we depend on atomic igrab() -bzzz */
 974         lock_res_and_lock(lock);
 975         if (lock->l_resource->lr_lvb_inode) {
 976                 struct ll_inode_info *lli;
 977
 978                 lli = ll_i2info(lock->l_resource->lr_lvb_inode);
 979                 if (lli->lli_inode_magic == LLI_INODE_MAGIC) {
 980                         inode = igrab(lock->l_resource->lr_lvb_inode);
 981                 } else {
 982                         inode = lock->l_resource->lr_lvb_inode;
 983                         LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ?  D_INFO :
 984                                          D_WARNING, lock, "lr_lvb_inode %p is bogus: magic %08x",
 985                                          lock->l_resource->lr_lvb_inode,
 986                                          lli->lli_inode_magic);
 987                         inode = NULL;
 988                 }
 989         }
 990         unlock_res_and_lock(lock);
 991         return inode;
 992 }
 993
 994 void ll_clear_inode(struct inode *inode)
 995 {
 996         struct ll_inode_info *lli = ll_i2info(inode);
 997         struct ll_sb_info *sbi = ll_i2sbi(inode);
 998
 999         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
1000                PFID(ll_inode2fid(inode)), inode);
1001
1002         if (S_ISDIR(inode->i_mode)) {
1003                 /* these should have been cleared in ll_file_release */
1004                 LASSERT(!lli->lli_opendir_key);
1005                 LASSERT(!lli->lli_sai);
1006                 LASSERT(lli->lli_opendir_pid == 0);
1007         }
1008
1009         spin_lock(&lli->lli_lock);
1010         ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK;
1011         spin_unlock(&lli->lli_lock);
1012         md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode));
1013
1014         LASSERT(!lli->lli_open_fd_write_count);
1015         LASSERT(!lli->lli_open_fd_read_count);
1016         LASSERT(!lli->lli_open_fd_exec_count);
1017
1018         if (lli->lli_mds_write_och)
1019                 ll_md_real_close(inode, FMODE_WRITE);
1020         if (lli->lli_mds_exec_och)
1021                 ll_md_real_close(inode, FMODE_EXEC);
1022         if (lli->lli_mds_read_och)
1023                 ll_md_real_close(inode, FMODE_READ);
1024
1025         if (S_ISLNK(inode->i_mode)) {
1026                 kfree(lli->lli_symlink_name);
1027                 lli->lli_symlink_name = NULL;
1028         }
1029
1030         ll_xattr_cache_destroy(inode);
1031
1032 #ifdef CONFIG_FS_POSIX_ACL
1033         if (lli->lli_posix_acl) {
1034                 LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
1035                 posix_acl_release(lli->lli_posix_acl);
1036                 lli->lli_posix_acl = NULL;
1037         }
1038 #endif
1039         lli->lli_inode_magic = LLI_INODE_DEAD;
1040
1041         if (!S_ISDIR(inode->i_mode))
1042                 LASSERT(list_empty(&lli->lli_agl_list));
1043
1044         /*
1045          * XXX This has to be done before lsm is freed below, because
1046          * cl_object still uses inode lsm.
1047          */
1048         cl_inode_fini(inode);
1049         lli->lli_has_smd = false;
1050 }
1051
1052 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
1053
1054 static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data,
1055                          struct md_open_data **mod)
1056 {
1057         struct lustre_md md;
1058         struct inode *inode = d_inode(dentry);
1059         struct ll_sb_info *sbi = ll_i2sbi(inode);
1060         struct ptlrpc_request *request = NULL;
1061         int rc, ia_valid;
1062
1063         op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0,
1064                                      LUSTRE_OPC_ANY, NULL);
1065         if (IS_ERR(op_data))
1066                 return PTR_ERR(op_data);
1067
1068         rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, NULL, 0,
1069                         &request, mod);
1070         if (rc) {
1071                 ptlrpc_req_finished(request);
1072                 if (rc == -ENOENT) {
1073                         clear_nlink(inode);
1074                         /* Unlinked special device node? Or just a race?
1075                          * Pretend we did everything.
1076                          */
1077                         if (!S_ISREG(inode->i_mode) &&
1078                             !S_ISDIR(inode->i_mode)) {
1079                                 ia_valid = op_data->op_attr.ia_valid;
1080                                 op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS;
1081                                 rc = simple_setattr(dentry, &op_data->op_attr);
1082                                 op_data->op_attr.ia_valid = ia_valid;
1083                         }
1084                 } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) {
1085                         CERROR("md_setattr fails: rc = %d\n", rc);
1086                 }
1087                 return rc;
1088         }
1089
1090         rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp,
1091                               sbi->ll_md_exp, &md);
1092         if (rc) {
1093                 ptlrpc_req_finished(request);
1094                 return rc;
1095         }
1096
1097         ia_valid = op_data->op_attr.ia_valid;
1098         /* inode size will be in cl_setattr_ost, can't do it now since dirty
1099          * cache is not cleared yet.
1100          */
1101         op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
1102         rc = simple_setattr(dentry, &op_data->op_attr);
1103         op_data->op_attr.ia_valid = ia_valid;
1104
1105         /* Extract epoch data if obtained. */
1106         op_data->op_handle = md.body->handle;
1107         op_data->op_ioepoch = md.body->ioepoch;
1108
1109         ll_update_inode(inode, &md);
1110         ptlrpc_req_finished(request);
1111
1112         return rc;
1113 }
1114
1115 /* Close IO epoch and send Size-on-MDS attribute update. */
1116 static int ll_setattr_done_writing(struct inode *inode,
1117                                    struct md_op_data *op_data,
1118                                    struct md_open_data *mod)
1119 {
1120         struct ll_inode_info *lli = ll_i2info(inode);
1121         int rc = 0;
1122
1123         if (!S_ISREG(inode->i_mode))
1124                 return 0;
1125
1126         CDEBUG(D_INODE, "Epoch %llu closed on "DFID" for truncate\n",
1127                op_data->op_ioepoch, PFID(&lli->lli_fid));
1128
1129         op_data->op_flags = MF_EPOCH_CLOSE;
1130         ll_done_writing_attr(inode, op_data);
1131         ll_pack_inode2opdata(inode, op_data, NULL);
1132
1133         rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, mod);
1134         if (rc == -EAGAIN)
1135                 /* MDS has instructed us to obtain Size-on-MDS attribute
1136                  * from OSTs and send setattr to back to MDS.
1137                  */
1138                 rc = ll_som_update(inode, op_data);
1139         else if (rc) {
1140                 CERROR("%s: inode "DFID" mdc truncate failed: rc = %d\n",
1141                       ll_i2sbi(inode)->ll_md_exp->exp_obd->obd_name,
1142                       PFID(ll_inode2fid(inode)), rc);
1143         }
1144         return rc;
1145 }
1146
1147 /* If this inode has objects allocated to it (lsm != NULL), then the OST
1148  * object(s) determine the file size and mtime.  Otherwise, the MDS will
1149  * keep these values until such a time that objects are allocated for it.
1150  * We do the MDS operations first, as it is checking permissions for us.
1151  * We don't to the MDS RPC if there is nothing that we want to store there,
1152  * otherwise there is no harm in updating mtime/atime on the MDS if we are
1153  * going to do an RPC anyways.
1154  *
1155  * If we are doing a truncate, we will send the mtime and ctime updates
1156  * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
1157  * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
1158  * at the same time.
1159  *
1160  * In case of HSMimport, we only set attr on MDS.
1161  */
1162 int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
1163 {
1164         struct inode *inode = d_inode(dentry);
1165         struct ll_inode_info *lli = ll_i2info(inode);
1166         struct md_op_data *op_data = NULL;
1167         struct md_open_data *mod = NULL;
1168         bool file_is_released = false;
1169         int rc = 0, rc1 = 0;
1170
1171         CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, valid %x, hsm_import %d\n",
1172                ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), inode,
1173                i_size_read(inode), attr->ia_size, attr->ia_valid, hsm_import);
1174
1175         if (attr->ia_valid & ATTR_SIZE) {
1176                 /* Check new size against VFS/VM file size limit and rlimit */
1177                 rc = inode_newsize_ok(inode, attr->ia_size);
1178                 if (rc)
1179                         return rc;
1180
1181                 /* The maximum Lustre file size is variable, based on the
1182                  * OST maximum object size and number of stripes.  This
1183                  * needs another check in addition to the VFS check above.
1184                  */
1185                 if (attr->ia_size > ll_file_maxbytes(inode)) {
1186                         CDEBUG(D_INODE, "file "DFID" too large %llu > %llu\n",
1187                                PFID(&lli->lli_fid), attr->ia_size,
1188                                ll_file_maxbytes(inode));
1189                         return -EFBIG;
1190                 }
1191
1192                 attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
1193         }
1194
1195         /* POSIX: check before ATTR_*TIME_SET set (from setattr_prepare) */
1196         if (attr->ia_valid & TIMES_SET_FLAGS) {
1197                 if ((!uid_eq(current_fsuid(), inode->i_uid)) &&
1198                     !capable(CFS_CAP_FOWNER))
1199                         return -EPERM;
1200         }
1201
1202         /* We mark all of the fields "set" so MDS/OST does not re-set them */
1203         if (attr->ia_valid & ATTR_CTIME) {
1204                 attr->ia_ctime = CURRENT_TIME;
1205                 attr->ia_valid |= ATTR_CTIME_SET;
1206         }
1207         if (!(attr->ia_valid & ATTR_ATIME_SET) &&
1208             (attr->ia_valid & ATTR_ATIME)) {
1209                 attr->ia_atime = CURRENT_TIME;
1210                 attr->ia_valid |= ATTR_ATIME_SET;
1211         }
1212         if (!(attr->ia_valid & ATTR_MTIME_SET) &&
1213             (attr->ia_valid & ATTR_MTIME)) {
1214                 attr->ia_mtime = CURRENT_TIME;
1215                 attr->ia_valid |= ATTR_MTIME_SET;
1216         }
1217
1218         if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
1219                 CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n",
1220                        LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
1221                        (s64)ktime_get_real_seconds());
1222
1223         /* We always do an MDS RPC, even if we're only changing the size;
1224          * only the MDS knows whether truncate() should fail with -ETXTBUSY
1225          */
1226
1227         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
1228         if (!op_data)
1229                 return -ENOMEM;
1230
1231         if (!S_ISDIR(inode->i_mode))
1232                 inode_unlock(inode);
1233
1234         /* truncate on a released file must failed with -ENODATA,
1235          * so size must not be set on MDS for released file
1236          * but other attributes must be set
1237          */
1238         if (S_ISREG(inode->i_mode)) {
1239                 struct lov_stripe_md *lsm;
1240                 __u32 gen;
1241
1242                 ll_layout_refresh(inode, &gen);
1243                 lsm = ccc_inode_lsm_get(inode);
1244                 if (lsm && lsm->lsm_pattern & LOV_PATTERN_F_RELEASED)
1245                         file_is_released = true;
1246                 ccc_inode_lsm_put(inode, lsm);
1247
1248                 if (!hsm_import && attr->ia_valid & ATTR_SIZE) {
1249                         if (file_is_released) {
1250                                 rc = ll_layout_restore(inode, 0, attr->ia_size);
1251                                 if (rc < 0)
1252                                         goto out;
1253
1254                                 file_is_released = false;
1255                                 ll_layout_refresh(inode, &gen);
1256                         }
1257
1258                         /*
1259                          * If we are changing file size, file content is
1260                          * modified, flag it.
1261                          */
1262                         attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
1263                         spin_lock(&lli->lli_lock);
1264                         lli->lli_flags |= LLIF_DATA_MODIFIED;
1265                         spin_unlock(&lli->lli_lock);
1266                         op_data->op_bias |= MDS_DATA_MODIFIED;
1267                 }
1268         }
1269
1270         memcpy(&op_data->op_attr, attr, sizeof(*attr));
1271
1272         /* Open epoch for truncate. */
1273         if (exp_connect_som(ll_i2mdexp(inode)) && !hsm_import &&
1274             (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET)))
1275                 op_data->op_flags = MF_EPOCH_OPEN;
1276
1277         rc = ll_md_setattr(dentry, op_data, &mod);
1278         if (rc)
1279                 goto out;
1280
1281         /* RPC to MDT is sent, cancel data modification flag */
1282         if (op_data->op_bias & MDS_DATA_MODIFIED) {
1283                 spin_lock(&lli->lli_lock);
1284                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
1285                 spin_unlock(&lli->lli_lock);
1286         }
1287
1288         ll_ioepoch_open(lli, op_data->op_ioepoch);
1289         if (!S_ISREG(inode->i_mode) || file_is_released) {
1290                 rc = 0;
1291                 goto out;
1292         }
1293
1294         if (attr->ia_valid & (ATTR_SIZE |
1295                               ATTR_ATIME | ATTR_ATIME_SET |
1296                               ATTR_MTIME | ATTR_MTIME_SET)) {
1297                 /* For truncate and utimes sending attributes to OSTs, setting
1298                  * mtime/atime to the past will be performed under PW [0:EOF]
1299                  * extent lock (new_size:EOF for truncate).  It may seem
1300                  * excessive to send mtime/atime updates to OSTs when not
1301                  * setting times to past, but it is necessary due to possible
1302                  * time de-synchronization between MDT inode and OST objects
1303                  */
1304                 if (attr->ia_valid & ATTR_SIZE)
1305                         down_write(&lli->lli_trunc_sem);
1306                 rc = cl_setattr_ost(inode, attr);
1307                 if (attr->ia_valid & ATTR_SIZE)
1308                         up_write(&lli->lli_trunc_sem);
1309         }
1310 out:
1311         if (op_data->op_ioepoch) {
1312                 rc1 = ll_setattr_done_writing(inode, op_data, mod);
1313                 if (!rc)
1314                         rc = rc1;
1315         }
1316         ll_finish_md_op_data(op_data);
1317
1318         if (!S_ISDIR(inode->i_mode)) {
1319                 inode_lock(inode);
1320                 if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
1321                         inode_dio_wait(inode);
1322         }
1323
1324         ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ?
1325                         LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1);
1326
1327         return rc;
1328 }
1329
1330 int ll_setattr(struct dentry *de, struct iattr *attr)
1331 {
1332         int mode = d_inode(de)->i_mode;
1333
1334         if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) ==
1335                               (ATTR_CTIME|ATTR_SIZE|ATTR_MODE))
1336                 attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
1337
1338         if (((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) ==
1339                                (ATTR_SIZE|ATTR_MODE)) &&
1340             (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
1341              (((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
1342               !(attr->ia_mode & S_ISGID))))
1343                 attr->ia_valid |= ATTR_FORCE;
1344
1345         if ((attr->ia_valid & ATTR_MODE) &&
1346             (mode & S_ISUID) &&
1347             !(attr->ia_mode & S_ISUID) &&
1348             !(attr->ia_valid & ATTR_KILL_SUID))
1349                 attr->ia_valid |= ATTR_KILL_SUID;
1350
1351         if ((attr->ia_valid & ATTR_MODE) &&
1352             ((mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP)) &&
1353             !(attr->ia_mode & S_ISGID) &&
1354             !(attr->ia_valid & ATTR_KILL_SGID))
1355                 attr->ia_valid |= ATTR_KILL_SGID;
1356
1357         return ll_setattr_raw(de, attr, false);
1358 }
1359
1360 int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
1361                        __u64 max_age, __u32 flags)
1362 {
1363         struct ll_sb_info *sbi = ll_s2sbi(sb);
1364         struct obd_statfs obd_osfs;
1365         int rc;
1366
1367         rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags);
1368         if (rc) {
1369                 CERROR("md_statfs fails: rc = %d\n", rc);
1370                 return rc;
1371         }
1372
1373         osfs->os_type = sb->s_magic;
1374
1375         CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n",
1376                osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,
1377                osfs->os_files);
1378
1379         if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
1380                 flags |= OBD_STATFS_NODELAY;
1381
1382         rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags);
1383         if (rc) {
1384                 CERROR("obd_statfs fails: rc = %d\n", rc);
1385                 return rc;
1386         }
1387
1388         CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n",
1389                obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree,
1390                obd_osfs.os_files);
1391
1392         osfs->os_bsize = obd_osfs.os_bsize;
1393         osfs->os_blocks = obd_osfs.os_blocks;
1394         osfs->os_bfree = obd_osfs.os_bfree;
1395         osfs->os_bavail = obd_osfs.os_bavail;
1396
1397         /* If we don't have as many objects free on the OST as inodes
1398          * on the MDS, we reduce the total number of inodes to
1399          * compensate, so that the "inodes in use" number is correct.
1400          */
1401         if (obd_osfs.os_ffree < osfs->os_ffree) {
1402                 osfs->os_files = (osfs->os_files - osfs->os_ffree) +
1403                         obd_osfs.os_ffree;
1404                 osfs->os_ffree = obd_osfs.os_ffree;
1405         }
1406
1407         return rc;
1408 }
1409
1410 int ll_statfs(struct dentry *de, struct kstatfs *sfs)
1411 {
1412         struct super_block *sb = de->d_sb;
1413         struct obd_statfs osfs;
1414         int rc;
1415
1416         CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64());
1417         ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1);
1418
1419         /* Some amount of caching on the client is allowed */
1420         rc = ll_statfs_internal(sb, &osfs,
1421                                 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
1422                                 0);
1423         if (rc)
1424                 return rc;
1425
1426         statfs_unpack(sfs, &osfs);
1427
1428         /* We need to downshift for all 32-bit kernels, because we can't
1429          * tell if the kernel is being called via sys_statfs64() or not.
1430          * Stop before overflowing f_bsize - in which case it is better
1431          * to just risk EOVERFLOW if caller is using old sys_statfs().
1432          */
1433         if (sizeof(long) < 8) {
1434                 while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) {
1435                         sfs->f_bsize <<= 1;
1436
1437                         osfs.os_blocks >>= 1;
1438                         osfs.os_bfree >>= 1;
1439                         osfs.os_bavail >>= 1;
1440                 }
1441         }
1442
1443         sfs->f_blocks = osfs.os_blocks;
1444         sfs->f_bfree = osfs.os_bfree;
1445         sfs->f_bavail = osfs.os_bavail;
1446         sfs->f_fsid = ll_s2sbi(sb)->ll_fsid;
1447         return 0;
1448 }
1449
1450 void ll_inode_size_lock(struct inode *inode)
1451 {
1452         struct ll_inode_info *lli;
1453
1454         LASSERT(!S_ISDIR(inode->i_mode));
1455
1456         lli = ll_i2info(inode);
1457         mutex_lock(&lli->lli_size_mutex);
1458 }
1459
1460 void ll_inode_size_unlock(struct inode *inode)
1461 {
1462         struct ll_inode_info *lli;
1463
1464         lli = ll_i2info(inode);
1465         mutex_unlock(&lli->lli_size_mutex);
1466 }
1467
1468 void ll_update_inode(struct inode *inode, struct lustre_md *md)
1469 {
1470         struct ll_inode_info *lli = ll_i2info(inode);
1471         struct mdt_body *body = md->body;
1472         struct lov_stripe_md *lsm = md->lsm;
1473         struct ll_sb_info *sbi = ll_i2sbi(inode);
1474
1475         LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
1476         if (lsm) {
1477                 if (!lli->lli_has_smd &&
1478                     !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
1479                         cl_file_inode_init(inode, md);
1480
1481                 lli->lli_maxbytes = lsm->lsm_maxbytes;
1482                 if (lli->lli_maxbytes > MAX_LFS_FILESIZE)
1483                         lli->lli_maxbytes = MAX_LFS_FILESIZE;
1484         }
1485
1486 #ifdef CONFIG_FS_POSIX_ACL
1487         if (body->valid & OBD_MD_FLACL) {
1488                 spin_lock(&lli->lli_lock);
1489                 if (lli->lli_posix_acl)
1490                         posix_acl_release(lli->lli_posix_acl);
1491                 lli->lli_posix_acl = md->posix_acl;
1492                 spin_unlock(&lli->lli_lock);
1493         }
1494 #endif
1495         inode->i_ino = cl_fid_build_ino(&body->fid1,
1496                                         sbi->ll_flags & LL_SBI_32BIT_API);
1497         inode->i_generation = cl_fid_build_gen(&body->fid1);
1498
1499         if (body->valid & OBD_MD_FLATIME) {
1500                 if (body->atime > LTIME_S(inode->i_atime))
1501                         LTIME_S(inode->i_atime) = body->atime;
1502                 lli->lli_atime = body->atime;
1503         }
1504         if (body->valid & OBD_MD_FLMTIME) {
1505                 if (body->mtime > LTIME_S(inode->i_mtime)) {
1506                         CDEBUG(D_INODE, "setting ino %lu mtime from %lu to %llu\n",
1507                                inode->i_ino, LTIME_S(inode->i_mtime),
1508                                body->mtime);
1509                         LTIME_S(inode->i_mtime) = body->mtime;
1510                 }
1511                 lli->lli_mtime = body->mtime;
1512         }
1513         if (body->valid & OBD_MD_FLCTIME) {
1514                 if (body->ctime > LTIME_S(inode->i_ctime))
1515                         LTIME_S(inode->i_ctime) = body->ctime;
1516                 lli->lli_ctime = body->ctime;
1517         }
1518         if (body->valid & OBD_MD_FLMODE)
1519                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
1520         if (body->valid & OBD_MD_FLTYPE)
1521                 inode->i_mode = (inode->i_mode & ~S_IFMT)|(body->mode & S_IFMT);
1522         LASSERT(inode->i_mode != 0);
1523         if (S_ISREG(inode->i_mode))
1524                 inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1,
1525                                        LL_MAX_BLKSIZE_BITS);
1526         else
1527                 inode->i_blkbits = inode->i_sb->s_blocksize_bits;
1528         if (body->valid & OBD_MD_FLUID)
1529                 inode->i_uid = make_kuid(&init_user_ns, body->uid);
1530         if (body->valid & OBD_MD_FLGID)
1531                 inode->i_gid = make_kgid(&init_user_ns, body->gid);
1532         if (body->valid & OBD_MD_FLFLAGS)
1533                 inode->i_flags = ll_ext_to_inode_flags(body->flags);
1534         if (body->valid & OBD_MD_FLNLINK)
1535                 set_nlink(inode, body->nlink);
1536         if (body->valid & OBD_MD_FLRDEV)
1537                 inode->i_rdev = old_decode_dev(body->rdev);
1538
1539         if (body->valid & OBD_MD_FLID) {
1540                 /* FID shouldn't be changed! */
1541                 if (fid_is_sane(&lli->lli_fid)) {
1542                         LASSERTF(lu_fid_eq(&lli->lli_fid, &body->fid1),
1543                                  "Trying to change FID "DFID" to the "DFID", inode "DFID"(%p)\n",
1544                                  PFID(&lli->lli_fid), PFID(&body->fid1),
1545                                  PFID(ll_inode2fid(inode)), inode);
1546                 } else {
1547                         lli->lli_fid = body->fid1;
1548                 }
1549         }
1550
1551         LASSERT(fid_seq(&lli->lli_fid) != 0);
1552
1553         if (body->valid & OBD_MD_FLSIZE) {
1554                 if (exp_connect_som(ll_i2mdexp(inode)) &&
1555                     S_ISREG(inode->i_mode)) {
1556                         struct lustre_handle lockh;
1557                         enum ldlm_mode mode;
1558
1559                         /* As it is possible a blocking ast has been processed
1560                          * by this time, we need to check there is an UPDATE
1561                          * lock on the client and set LLIF_MDS_SIZE_LOCK holding
1562                          * it.
1563                          */
1564                         mode = ll_take_md_lock(inode, MDS_INODELOCK_UPDATE,
1565                                                &lockh, LDLM_FL_CBPENDING,
1566                                                LCK_CR | LCK_CW |
1567                                                LCK_PR | LCK_PW);
1568                         if (mode) {
1569                                 if (lli->lli_flags & (LLIF_DONE_WRITING |
1570                                                       LLIF_EPOCH_PENDING |
1571                                                       LLIF_SOM_DIRTY)) {
1572                                         CERROR("%s: inode "DFID" flags %u still has size authority! do not trust the size got from MDS\n",
1573                                                sbi->ll_md_exp->exp_obd->obd_name,
1574                                                PFID(ll_inode2fid(inode)),
1575                                                lli->lli_flags);
1576                                 } else {
1577                                         /* Use old size assignment to avoid
1578                                          * deadlock bz14138 & bz14326
1579                                          */
1580                                         i_size_write(inode, body->size);
1581                                         spin_lock(&lli->lli_lock);
1582                                         lli->lli_flags |= LLIF_MDS_SIZE_LOCK;
1583                                         spin_unlock(&lli->lli_lock);
1584                                 }
1585                                 ldlm_lock_decref(&lockh, mode);
1586                         }
1587                 } else {
1588                         /* Use old size assignment to avoid
1589                          * deadlock bz14138 & bz14326
1590                          */
1591                         i_size_write(inode, body->size);
1592
1593                         CDEBUG(D_VFSTRACE, "inode=%lu, updating i_size %llu\n",
1594                                inode->i_ino, (unsigned long long)body->size);
1595                 }
1596
1597                 if (body->valid & OBD_MD_FLBLOCKS)
1598                         inode->i_blocks = body->blocks;
1599         }
1600
1601         if (body->valid & OBD_MD_TSTATE) {
1602                 if (body->t_state & MS_RESTORE)
1603                         lli->lli_flags |= LLIF_FILE_RESTORING;
1604         }
1605 }
1606
1607 void ll_read_inode2(struct inode *inode, void *opaque)
1608 {
1609         struct lustre_md *md = opaque;
1610         struct ll_inode_info *lli = ll_i2info(inode);
1611
1612         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
1613                PFID(&lli->lli_fid), inode);
1614
1615         LASSERT(!lli->lli_has_smd);
1616
1617         /* Core attributes from the MDS first.  This is a new inode, and
1618          * the VFS doesn't zero times in the core inode so we have to do
1619          * it ourselves.  They will be overwritten by either MDS or OST
1620          * attributes - we just need to make sure they aren't newer.
1621          */
1622         LTIME_S(inode->i_mtime) = 0;
1623         LTIME_S(inode->i_atime) = 0;
1624         LTIME_S(inode->i_ctime) = 0;
1625         inode->i_rdev = 0;
1626         ll_update_inode(inode, md);
1627
1628         /* OIDEBUG(inode); */
1629
1630         if (S_ISREG(inode->i_mode)) {
1631                 struct ll_sb_info *sbi = ll_i2sbi(inode);
1632
1633                 inode->i_op = &ll_file_inode_operations;
1634                 inode->i_fop = sbi->ll_fop;
1635                 inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops;
1636         } else if (S_ISDIR(inode->i_mode)) {
1637                 inode->i_op = &ll_dir_inode_operations;
1638                 inode->i_fop = &ll_dir_operations;
1639         } else if (S_ISLNK(inode->i_mode)) {
1640                 inode->i_op = &ll_fast_symlink_inode_operations;
1641         } else {
1642                 inode->i_op = &ll_special_inode_operations;
1643
1644                 init_special_inode(inode, inode->i_mode,
1645                                    inode->i_rdev);
1646         }
1647 }
1648
1649 void ll_delete_inode(struct inode *inode)
1650 {
1651         struct ll_inode_info *lli = ll_i2info(inode);
1652
1653         if (S_ISREG(inode->i_mode) && lli->lli_clob)
1654                 /* discard all dirty pages before truncating them, required by
1655                  * osc_extent implementation at LU-1030.
1656                  */
1657                 cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
1658                                    CL_FSYNC_DISCARD, 1);
1659
1660         truncate_inode_pages_final(&inode->i_data);
1661
1662         /* Workaround for LU-118 */
1663         if (inode->i_data.nrpages) {
1664                 spin_lock_irq(&inode->i_data.tree_lock);
1665                 spin_unlock_irq(&inode->i_data.tree_lock);
1666                 LASSERTF(inode->i_data.nrpages == 0,
1667                          "inode="DFID"(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n",
1668                          PFID(ll_inode2fid(inode)), inode,
1669                          inode->i_data.nrpages);
1670         }
1671         /* Workaround end */
1672
1673         ll_clear_inode(inode);
1674         clear_inode(inode);
1675 }
1676
1677 int ll_iocontrol(struct inode *inode, struct file *file,
1678                  unsigned int cmd, unsigned long arg)
1679 {
1680         struct ll_sb_info *sbi = ll_i2sbi(inode);
1681         struct ptlrpc_request *req = NULL;
1682         int rc, flags = 0;
1683
1684         switch (cmd) {
1685         case FSFILT_IOC_GETFLAGS: {
1686                 struct mdt_body *body;
1687                 struct md_op_data *op_data;
1688
1689                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
1690                                              0, 0, LUSTRE_OPC_ANY,
1691                                              NULL);
1692                 if (IS_ERR(op_data))
1693                         return PTR_ERR(op_data);
1694
1695                 op_data->op_valid = OBD_MD_FLFLAGS;
1696                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
1697                 ll_finish_md_op_data(op_data);
1698                 if (rc) {
1699                         CERROR("%s: failure inode "DFID": rc = %d\n",
1700                                sbi->ll_md_exp->exp_obd->obd_name,
1701                                PFID(ll_inode2fid(inode)), rc);
1702                         return -abs(rc);
1703                 }
1704
1705                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1706
1707                 flags = body->flags;
1708
1709                 ptlrpc_req_finished(req);
1710
1711                 return put_user(flags, (int __user *)arg);
1712         }
1713         case FSFILT_IOC_SETFLAGS: {
1714                 struct lov_stripe_md *lsm;
1715                 struct obd_info oinfo = { };
1716                 struct md_op_data *op_data;
1717
1718                 if (get_user(flags, (int __user *)arg))
1719                         return -EFAULT;
1720
1721                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1722                                              LUSTRE_OPC_ANY, NULL);
1723                 if (IS_ERR(op_data))
1724                         return PTR_ERR(op_data);
1725
1726                 op_data->op_attr_flags = flags;
1727                 op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
1728                 rc = md_setattr(sbi->ll_md_exp, op_data,
1729                                 NULL, 0, NULL, 0, &req, NULL);
1730                 ll_finish_md_op_data(op_data);
1731                 ptlrpc_req_finished(req);
1732                 if (rc)
1733                         return rc;
1734
1735                 inode->i_flags = ll_ext_to_inode_flags(flags);
1736
1737                 lsm = ccc_inode_lsm_get(inode);
1738                 if (!lsm_has_objects(lsm)) {
1739                         ccc_inode_lsm_put(inode, lsm);
1740                         return 0;
1741                 }
1742
1743                 oinfo.oi_oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
1744                 if (!oinfo.oi_oa) {
1745                         ccc_inode_lsm_put(inode, lsm);
1746                         return -ENOMEM;
1747                 }
1748                 oinfo.oi_md = lsm;
1749                 oinfo.oi_oa->o_oi = lsm->lsm_oi;
1750                 oinfo.oi_oa->o_flags = flags;
1751                 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS |
1752                                        OBD_MD_FLGROUP;
1753                 obdo_set_parent_fid(oinfo.oi_oa, &ll_i2info(inode)->lli_fid);
1754                 rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
1755                 kmem_cache_free(obdo_cachep, oinfo.oi_oa);
1756                 ccc_inode_lsm_put(inode, lsm);
1757
1758                 if (rc && rc != -EPERM && rc != -EACCES)
1759                         CERROR("osc_setattr_async fails: rc = %d\n", rc);
1760
1761                 return rc;
1762         }
1763         default:
1764                 return -ENOSYS;
1765         }
1766
1767         return 0;
1768 }
1769
1770 int ll_flush_ctx(struct inode *inode)
1771 {
1772         struct ll_sb_info  *sbi = ll_i2sbi(inode);
1773
1774         CDEBUG(D_SEC, "flush context for user %d\n",
1775                from_kuid(&init_user_ns, current_uid()));
1776
1777         obd_set_info_async(NULL, sbi->ll_md_exp,
1778                            sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
1779                            0, NULL, NULL);
1780         obd_set_info_async(NULL, sbi->ll_dt_exp,
1781                            sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX,
1782                            0, NULL, NULL);
1783         return 0;
1784 }
1785
1786 /* umount -f client means force down, don't save state */
1787 void ll_umount_begin(struct super_block *sb)
1788 {
1789         struct ll_sb_info *sbi = ll_s2sbi(sb);
1790         struct obd_device *obd;
1791         struct obd_ioctl_data *ioc_data;
1792
1793         CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
1794                sb->s_count, atomic_read(&sb->s_active));
1795
1796         obd = class_exp2obd(sbi->ll_md_exp);
1797         if (!obd) {
1798                 CERROR("Invalid MDC connection handle %#llx\n",
1799                        sbi->ll_md_exp->exp_handle.h_cookie);
1800                 return;
1801         }
1802         obd->obd_force = 1;
1803
1804         obd = class_exp2obd(sbi->ll_dt_exp);
1805         if (!obd) {
1806                 CERROR("Invalid LOV connection handle %#llx\n",
1807                        sbi->ll_dt_exp->exp_handle.h_cookie);
1808                 return;
1809         }
1810         obd->obd_force = 1;
1811
1812         ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS);
1813         if (ioc_data) {
1814                 obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp,
1815                               sizeof(*ioc_data), ioc_data, NULL);
1816
1817                 obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp,
1818                               sizeof(*ioc_data), ioc_data, NULL);
1819
1820                 kfree(ioc_data);
1821         }
1822
1823         /* Really, we'd like to wait until there are no requests outstanding,
1824          * and then continue.  For now, we just invalidate the requests,
1825          * schedule() and sleep one second if needed, and hope.
1826          */
1827         schedule();
1828 }
1829
1830 int ll_remount_fs(struct super_block *sb, int *flags, char *data)
1831 {
1832         struct ll_sb_info *sbi = ll_s2sbi(sb);
1833         char *profilenm = get_profile_name(sb);
1834         int err;
1835         __u32 read_only;
1836
1837         if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
1838                 read_only = *flags & MS_RDONLY;
1839                 err = obd_set_info_async(NULL, sbi->ll_md_exp,
1840                                          sizeof(KEY_READ_ONLY),
1841                                          KEY_READ_ONLY, sizeof(read_only),
1842                                          &read_only, NULL);
1843                 if (err) {
1844                         LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
1845                                       profilenm, read_only ?
1846                                       "read-only" : "read-write", err);
1847                         return err;
1848                 }
1849
1850                 if (read_only)
1851                         sb->s_flags |= MS_RDONLY;
1852                 else
1853                         sb->s_flags &= ~MS_RDONLY;
1854
1855                 if (sbi->ll_flags & LL_SBI_VERBOSE)
1856                         LCONSOLE_WARN("Remounted %s %s\n", profilenm,
1857                                       read_only ?  "read-only" : "read-write");
1858         }
1859         return 0;
1860 }
1861
1862 /**
1863  * Cleanup the open handle that is cached on MDT-side.
1864  *
1865  * For open case, the client side open handling thread may hit error
1866  * after the MDT grant the open. Under such case, the client should
1867  * send close RPC to the MDT as cleanup; otherwise, the open handle
1868  * on the MDT will be leaked there until the client umount or evicted.
1869  *
1870  * In further, if someone unlinked the file, because the open handle
1871  * holds the reference on such file/object, then it will block the
1872  * subsequent threads that want to locate such object via FID.
1873  *
1874  * \param[in] sb        super block for this file-system
1875  * \param[in] open_req  pointer to the original open request
1876  */
1877 void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req)
1878 {
1879         struct mdt_body                 *body;
1880         struct md_op_data               *op_data;
1881         struct ptlrpc_request           *close_req = NULL;
1882         struct obd_export               *exp       = ll_s2sbi(sb)->ll_md_exp;
1883
1884         body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
1885         op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
1886         if (!op_data)
1887                 return;
1888
1889         op_data->op_fid1 = body->fid1;
1890         op_data->op_ioepoch = body->ioepoch;
1891         op_data->op_handle = body->handle;
1892         op_data->op_mod_time = get_seconds();
1893         md_close(exp, op_data, NULL, &close_req);
1894         ptlrpc_req_finished(close_req);
1895         ll_finish_md_op_data(op_data);
1896 }
1897
1898 int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
1899                   struct super_block *sb, struct lookup_intent *it)
1900 {
1901         struct ll_sb_info *sbi = NULL;
1902         struct lustre_md md = { NULL };
1903         int rc;
1904
1905         LASSERT(*inode || sb);
1906         sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
1907         rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
1908                               sbi->ll_md_exp, &md);
1909         if (rc)
1910                 goto cleanup;
1911
1912         if (*inode) {
1913                 ll_update_inode(*inode, &md);
1914         } else {
1915                 LASSERT(sb);
1916
1917                 /*
1918                  * At this point server returns to client's same fid as client
1919                  * generated for creating. So using ->fid1 is okay here.
1920                  */
1921                 if (!fid_is_sane(&md.body->fid1)) {
1922                         CERROR("%s: Fid is insane " DFID "\n",
1923                                ll_get_fsname(sb, NULL, 0),
1924                                PFID(&md.body->fid1));
1925                         rc = -EINVAL;
1926                         goto out;
1927                 }
1928
1929                 *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1,
1930                                              sbi->ll_flags & LL_SBI_32BIT_API),
1931                                  &md);
1932                 if (!*inode) {
1933 #ifdef CONFIG_FS_POSIX_ACL
1934                         if (md.posix_acl) {
1935                                 posix_acl_release(md.posix_acl);
1936                                 md.posix_acl = NULL;
1937                         }
1938 #endif
1939                         rc = -ENOMEM;
1940                         CERROR("new_inode -fatal: rc %d\n", rc);
1941                         goto out;
1942                 }
1943         }
1944
1945         /* Handling piggyback layout lock.
1946          * Layout lock can be piggybacked by getattr and open request.
1947          * The lsm can be applied to inode only if it comes with a layout lock
1948          * otherwise correct layout may be overwritten, for example:
1949          * 1. proc1: mdt returns a lsm but not granting layout
1950          * 2. layout was changed by another client
1951          * 3. proc2: refresh layout and layout lock granted
1952          * 4. proc1: to apply a stale layout
1953          */
1954         if (it && it->it_lock_mode != 0) {
1955                 struct lustre_handle lockh;
1956                 struct ldlm_lock *lock;
1957
1958                 lockh.cookie = it->it_lock_handle;
1959                 lock = ldlm_handle2lock(&lockh);
1960                 LASSERT(lock);
1961                 if (ldlm_has_layout(lock)) {
1962                         struct cl_object_conf conf;
1963
1964                         memset(&conf, 0, sizeof(conf));
1965                         conf.coc_opc = OBJECT_CONF_SET;
1966                         conf.coc_inode = *inode;
1967                         conf.coc_lock = lock;
1968                         conf.u.coc_md = &md;
1969                         (void)ll_layout_conf(*inode, &conf);
1970                 }
1971                 LDLM_LOCK_PUT(lock);
1972         }
1973
1974 out:
1975         if (md.lsm)
1976                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
1977         md_free_lustre_md(sbi->ll_md_exp, &md);
1978
1979 cleanup:
1980         if (rc != 0 && it && it->it_op & IT_OPEN)
1981                 ll_open_cleanup(sb ? sb : (*inode)->i_sb, req);
1982
1983         return rc;
1984 }
1985
1986 int ll_obd_statfs(struct inode *inode, void __user *arg)
1987 {
1988         struct ll_sb_info *sbi = NULL;
1989         struct obd_export *exp;
1990         char *buf = NULL;
1991         struct obd_ioctl_data *data = NULL;
1992         __u32 type;
1993         int len = 0, rc;
1994
1995         if (!inode) {
1996                 rc = -EINVAL;
1997                 goto out_statfs;
1998         }
1999
2000         sbi = ll_i2sbi(inode);
2001         if (!sbi) {
2002                 rc = -EINVAL;
2003                 goto out_statfs;
2004         }
2005
2006         rc = obd_ioctl_getdata(&buf, &len, arg);
2007         if (rc)
2008                 goto out_statfs;
2009
2010         data = (void *)buf;
2011         if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
2012             !data->ioc_pbuf1 || !data->ioc_pbuf2) {
2013                 rc = -EINVAL;
2014                 goto out_statfs;
2015         }
2016
2017         if (data->ioc_inllen1 != sizeof(__u32) ||
2018             data->ioc_inllen2 != sizeof(__u32) ||
2019             data->ioc_plen1 != sizeof(struct obd_statfs) ||
2020             data->ioc_plen2 != sizeof(struct obd_uuid)) {
2021                 rc = -EINVAL;
2022                 goto out_statfs;
2023         }
2024
2025         memcpy(&type, data->ioc_inlbuf1, sizeof(__u32));
2026         if (type & LL_STATFS_LMV) {
2027                 exp = sbi->ll_md_exp;
2028         } else if (type & LL_STATFS_LOV) {
2029                 exp = sbi->ll_dt_exp;
2030         } else {
2031                 rc = -ENODEV;
2032                 goto out_statfs;
2033         }
2034
2035         rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL);
2036         if (rc)
2037                 goto out_statfs;
2038 out_statfs:
2039         if (buf)
2040                 obd_ioctl_freedata(buf, len);
2041         return rc;
2042 }
2043
2044 int ll_process_config(struct lustre_cfg *lcfg)
2045 {
2046         char *ptr;
2047         void *sb;
2048         struct lprocfs_static_vars lvars;
2049         unsigned long x;
2050         int rc = 0;
2051
2052         lprocfs_llite_init_vars(&lvars);
2053
2054         /* The instance name contains the sb: lustre-client-aacfe000 */
2055         ptr = strrchr(lustre_cfg_string(lcfg, 0), '-');
2056         if (!ptr || !*(++ptr))
2057                 return -EINVAL;
2058         rc = kstrtoul(ptr, 16, &x);
2059         if (rc != 0)
2060                 return -EINVAL;
2061         sb = (void *)x;
2062         /* This better be a real Lustre superblock! */
2063         LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == LMD_MAGIC);
2064
2065         /* Note we have not called client_common_fill_super yet, so
2066          * proc fns must be able to handle that!
2067          */
2068         rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars,
2069                                       lcfg, sb);
2070         if (rc > 0)
2071                 rc = 0;
2072         return rc;
2073 }
2074
2075 /* this function prepares md_op_data hint for passing ot down to MD stack. */
2076 struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data,
2077                                       struct inode *i1, struct inode *i2,
2078                                       const char *name, int namelen,
2079                                       int mode, __u32 opc, void *data)
2080 {
2081         if (namelen > ll_i2sbi(i1)->ll_namelen)
2082                 return ERR_PTR(-ENAMETOOLONG);
2083
2084         if (!op_data)
2085                 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
2086
2087         if (!op_data)
2088                 return ERR_PTR(-ENOMEM);
2089
2090         ll_i2gids(op_data->op_suppgids, i1, i2);
2091         op_data->op_fid1 = *ll_inode2fid(i1);
2092
2093         if (i2)
2094                 op_data->op_fid2 = *ll_inode2fid(i2);
2095         else
2096                 fid_zero(&op_data->op_fid2);
2097
2098         op_data->op_name = name;
2099         op_data->op_namelen = namelen;
2100         op_data->op_mode = mode;
2101         op_data->op_mod_time = ktime_get_real_seconds();
2102         op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
2103         op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
2104         op_data->op_cap = cfs_curproc_cap_pack();
2105         op_data->op_bias = 0;
2106         op_data->op_cli_flags = 0;
2107         if ((opc == LUSTRE_OPC_CREATE) && name &&
2108             filename_is_volatile(name, namelen, NULL))
2109                 op_data->op_bias |= MDS_CREATE_VOLATILE;
2110         op_data->op_opc = opc;
2111         op_data->op_mds = 0;
2112         op_data->op_data = data;
2113
2114         /* If the file is being opened after mknod() (normally due to NFS)
2115          * try to use the default stripe data from parent directory for
2116          * allocating OST objects.  Try to pass the parent FID to MDS.
2117          */
2118         if (opc == LUSTRE_OPC_CREATE && i1 == i2 && S_ISREG(i2->i_mode) &&
2119             !ll_i2info(i2)->lli_has_smd) {
2120                 struct ll_inode_info *lli = ll_i2info(i2);
2121
2122                 spin_lock(&lli->lli_lock);
2123                 if (likely(!lli->lli_has_smd && !fid_is_zero(&lli->lli_pfid)))
2124                         op_data->op_fid1 = lli->lli_pfid;
2125                 spin_unlock(&lli->lli_lock);
2126         }
2127
2128         /* When called by ll_setattr_raw, file is i1. */
2129         if (ll_i2info(i1)->lli_flags & LLIF_DATA_MODIFIED)
2130                 op_data->op_bias |= MDS_DATA_MODIFIED;
2131
2132         return op_data;
2133 }
2134
2135 void ll_finish_md_op_data(struct md_op_data *op_data)
2136 {
2137         kfree(op_data);
2138 }
2139
2140 int ll_show_options(struct seq_file *seq, struct dentry *dentry)
2141 {
2142         struct ll_sb_info *sbi;
2143
2144         LASSERT(seq && dentry);
2145         sbi = ll_s2sbi(dentry->d_sb);
2146
2147         if (sbi->ll_flags & LL_SBI_NOLCK)
2148                 seq_puts(seq, ",nolock");
2149
2150         if (sbi->ll_flags & LL_SBI_FLOCK)
2151                 seq_puts(seq, ",flock");
2152
2153         if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
2154                 seq_puts(seq, ",localflock");
2155
2156         if (sbi->ll_flags & LL_SBI_USER_XATTR)
2157                 seq_puts(seq, ",user_xattr");
2158
2159         if (sbi->ll_flags & LL_SBI_LAZYSTATFS)
2160                 seq_puts(seq, ",lazystatfs");
2161
2162         if (sbi->ll_flags & LL_SBI_USER_FID2PATH)
2163                 seq_puts(seq, ",user_fid2path");
2164
2165         return 0;
2166 }
2167
2168 /**
2169  * Get obd name by cmd, and copy out to user space
2170  */
2171 int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg)
2172 {
2173         struct ll_sb_info *sbi = ll_i2sbi(inode);
2174         struct obd_device *obd;
2175
2176         if (cmd == OBD_IOC_GETDTNAME)
2177                 obd = class_exp2obd(sbi->ll_dt_exp);
2178         else if (cmd == OBD_IOC_GETMDNAME)
2179                 obd = class_exp2obd(sbi->ll_md_exp);
2180         else
2181                 return -EINVAL;
2182
2183         if (!obd)
2184                 return -ENOENT;
2185
2186         if (copy_to_user((void __user *)arg, obd->obd_name,
2187                          strlen(obd->obd_name) + 1))
2188                 return -EFAULT;
2189
2190         return 0;
2191 }
2192
2193 /**
2194  * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the
2195  * fsname will be returned in this buffer; otherwise, a static buffer will be
2196  * used to store the fsname and returned to caller.
2197  */
2198 char *ll_get_fsname(struct super_block *sb, char *buf, int buflen)
2199 {
2200         static char fsname_static[MTI_NAME_MAXLEN];
2201         struct lustre_sb_info *lsi = s2lsi(sb);
2202         char *ptr;
2203         int len;
2204
2205         if (!buf) {
2206                 /* this means the caller wants to use static buffer
2207                  * and it doesn't care about race. Usually this is
2208                  * in error reporting path
2209                  */
2210                 buf = fsname_static;
2211                 buflen = sizeof(fsname_static);
2212         }
2213
2214         len = strlen(lsi->lsi_lmd->lmd_profile);
2215         ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-');
2216         if (ptr && (strcmp(ptr, "-client") == 0))
2217                 len -= 7;
2218
2219         if (unlikely(len >= buflen))
2220                 len = buflen - 1;
2221         strncpy(buf, lsi->lsi_lmd->lmd_profile, len);
2222         buf[len] = '\0';
2223
2224         return buf;
2225 }
2226
2227 void ll_dirty_page_discard_warn(struct page *page, int ioret)
2228 {
2229         char *buf, *path = NULL;
2230         struct dentry *dentry = NULL;
2231         struct vvp_object *obj = cl_inode2vvp(page->mapping->host);
2232
2233         /* this can be called inside spin lock so use GFP_ATOMIC. */
2234         buf = (char *)__get_free_page(GFP_ATOMIC);
2235         if (buf) {
2236                 dentry = d_find_alias(page->mapping->host);
2237                 if (dentry)
2238                         path = dentry_path_raw(dentry, buf, PAGE_SIZE);
2239         }
2240
2241         CDEBUG(D_WARNING,
2242                "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n",
2243                ll_get_fsname(page->mapping->host->i_sb, NULL, 0),
2244                s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev,
2245                PFID(&obj->vob_header.coh_lu.loh_fid),
2246                (path && !IS_ERR(path)) ? path : "", ioret);
2247
2248         if (dentry)
2249                 dput(dentry);
2250
2251         if (buf)
2252                 free_page((unsigned long)buf);
2253 }