fs/ocfs2/dlmfs/dlmfs.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * dlmfs.c
   4  *
   5  * Code which implements the kernel side of a minimal userspace
   6  * interface to our DLM. This file handles the virtual file system
   7  * used for communication with userspace. Credit should go to ramfs,
   8  * which was a template for the fs side of this module.
   9  *
  10  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
  11  */
  12
  13 /* Simple VFS hooks based on: */
  14 /*
  15  * Resizable simple ram filesystem for Linux.
  16  *
  17  * Copyright (C) 2000 Linus Torvalds.
  18  *               2000 Transmeta Corp.
  19  */
  20
  21 #include <linux/module.h>
  22 #include <linux/fs.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/types.h>
  25 #include <linux/slab.h>
  26 #include <linux/highmem.h>
  27 #include <linux/init.h>
  28 #include <linux/string.h>
  29 #include <linux/backing-dev.h>
  30 #include <linux/poll.h>
  31
  32 #include <linux/uaccess.h>
  33
  34 #include "../stackglue.h"
  35 #include "userdlm.h"
  36
  37 #define MLOG_MASK_PREFIX ML_DLMFS
  38 #include "../cluster/masklog.h"
  39
  40
  41 static const struct super_operations dlmfs_ops;
  42 static const struct file_operations dlmfs_file_operations;
  43 static const struct inode_operations dlmfs_dir_inode_operations;
  44 static const struct inode_operations dlmfs_root_inode_operations;
  45 static const struct inode_operations dlmfs_file_inode_operations;
  46 static struct kmem_cache *dlmfs_inode_cache;
  47
  48 struct workqueue_struct *user_dlm_worker;
  49
  50
  51
  52 /*
  53  * These are the ABI capabilities of dlmfs.
  54  *
  55  * Over time, dlmfs has added some features that were not part of the
  56  * initial ABI.  Unfortunately, some of these features are not detectable
  57  * via standard usage.  For example, Linux's default poll always returns
  58  * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs
  59  * added poll support.  Instead, we provide this list of new capabilities.
  60  *
  61  * Capabilities is a read-only attribute.  We do it as a module parameter
  62  * so we can discover it whether dlmfs is built in, loaded, or even not
  63  * loaded.
  64  *
  65  * The ABI features are local to this machine's dlmfs mount.  This is
  66  * distinct from the locking protocol, which is concerned with inter-node
  67  * interaction.
  68  *
  69  * Capabilities:
  70  * - bast       : EPOLLIN against the file descriptor of a held lock
  71  *                signifies a bast fired on the lock.
  72  */
  73 #define DLMFS_CAPABILITIES "bast stackglue"
  74 static int param_set_dlmfs_capabilities(const char *val,
  75                                         const struct kernel_param *kp)
  76 {
  77         printk(KERN_ERR "%s: readonly parameter\n", kp->name);
  78         return -EINVAL;
  79 }
  80 static int param_get_dlmfs_capabilities(char *buffer,
  81                                         const struct kernel_param *kp)
  82 {
  83         return strlcpy(buffer, DLMFS_CAPABILITIES,
  84                        strlen(DLMFS_CAPABILITIES) + 1);
  85 }
  86 module_param_call(capabilities, param_set_dlmfs_capabilities,
  87                   param_get_dlmfs_capabilities, NULL, 0444);
  88 MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
  89
  90
  91 /*
  92  * decodes a set of open flags into a valid lock level and a set of flags.
  93  * returns < 0 if we have invalid flags
  94  * flags which mean something to us:
  95  * O_RDONLY -> PRMODE level
  96  * O_WRONLY -> EXMODE level
  97  *
  98  * O_NONBLOCK -> NOQUEUE
  99  */
 100 static int dlmfs_decode_open_flags(int open_flags,
 101                                    int *level,
 102                                    int *flags)
 103 {
 104         if (open_flags & (O_WRONLY|O_RDWR))
 105                 *level = DLM_LOCK_EX;
 106         else
 107                 *level = DLM_LOCK_PR;
 108
 109         *flags = 0;
 110         if (open_flags & O_NONBLOCK)
 111                 *flags |= DLM_LKF_NOQUEUE;
 112
 113         return 0;
 114 }
 115
 116 static int dlmfs_file_open(struct inode *inode,
 117                            struct file *file)
 118 {
 119         int status, level, flags;
 120         struct dlmfs_filp_private *fp = NULL;
 121         struct dlmfs_inode_private *ip;
 122
 123         if (S_ISDIR(inode->i_mode))
 124                 BUG();
 125
 126         mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
 127                 file->f_flags);
 128
 129         status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
 130         if (status < 0)
 131                 goto bail;
 132
 133         /* We don't want to honor O_APPEND at read/write time as it
 134          * doesn't make sense for LVB writes. */
 135         file->f_flags &= ~O_APPEND;
 136
 137         fp = kmalloc(sizeof(*fp), GFP_NOFS);
 138         if (!fp) {
 139                 status = -ENOMEM;
 140                 goto bail;
 141         }
 142         fp->fp_lock_level = level;
 143
 144         ip = DLMFS_I(inode);
 145
 146         status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
 147         if (status < 0) {
 148                 /* this is a strange error to return here but I want
 149                  * to be able userspace to be able to distinguish a
 150                  * valid lock request from one that simply couldn't be
 151                  * granted. */
 152                 if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
 153                         status = -ETXTBSY;
 154                 kfree(fp);
 155                 goto bail;
 156         }
 157
 158         file->private_data = fp;
 159 bail:
 160         return status;
 161 }
 162
 163 static int dlmfs_file_release(struct inode *inode,
 164                               struct file *file)
 165 {
 166         int level;
 167         struct dlmfs_inode_private *ip = DLMFS_I(inode);
 168         struct dlmfs_filp_private *fp = file->private_data;
 169
 170         if (S_ISDIR(inode->i_mode))
 171                 BUG();
 172
 173         mlog(0, "close called on inode %lu\n", inode->i_ino);
 174
 175         if (fp) {
 176                 level = fp->fp_lock_level;
 177                 if (level != DLM_LOCK_IV)
 178                         user_dlm_cluster_unlock(&ip->ip_lockres, level);
 179
 180                 kfree(fp);
 181                 file->private_data = NULL;
 182         }
 183
 184         return 0;
 185 }
 186
 187 /*
 188  * We do ->setattr() just to override size changes.  Our size is the size
 189  * of the LVB and nothing else.
 190  */
 191 static int dlmfs_file_setattr(struct mnt_idmap *idmap,
 192                               struct dentry *dentry, struct iattr *attr)
 193 {
 194         int error;
 195         struct inode *inode = d_inode(dentry);
 196
 197         attr->ia_valid &= ~ATTR_SIZE;
 198         error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
 199         if (error)
 200                 return error;
 201
 202         setattr_copy(&nop_mnt_idmap, inode, attr);
 203         mark_inode_dirty(inode);
 204         return 0;
 205 }
 206
 207 static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait)
 208 {
 209         __poll_t event = 0;
 210         struct inode *inode = file_inode(file);
 211         struct dlmfs_inode_private *ip = DLMFS_I(inode);
 212
 213         poll_wait(file, &ip->ip_lockres.l_event, wait);
 214
 215         spin_lock(&ip->ip_lockres.l_lock);
 216         if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
 217                 event = EPOLLIN | EPOLLRDNORM;
 218         spin_unlock(&ip->ip_lockres.l_lock);
 219
 220         return event;
 221 }
 222
 223 static ssize_t dlmfs_file_read(struct file *file,
 224                                char __user *buf,
 225                                size_t count,
 226                                loff_t *ppos)
 227 {
 228         char lvb[DLM_LVB_LEN];
 229
 230         if (!user_dlm_read_lvb(file_inode(file), lvb))
 231                 return 0;
 232
 233         return simple_read_from_buffer(buf, count, ppos, lvb, sizeof(lvb));
 234 }
 235
 236 static ssize_t dlmfs_file_write(struct file *filp,
 237                                 const char __user *buf,
 238                                 size_t count,
 239                                 loff_t *ppos)
 240 {
 241         char lvb_buf[DLM_LVB_LEN];
 242         int bytes_left;
 243         struct inode *inode = file_inode(filp);
 244
 245         mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
 246                 inode->i_ino, count, *ppos);
 247
 248         if (*ppos >= DLM_LVB_LEN)
 249                 return -ENOSPC;
 250
 251         /* don't write past the lvb */
 252         if (count > DLM_LVB_LEN - *ppos)
 253                 count = DLM_LVB_LEN - *ppos;
 254
 255         if (!count)
 256                 return 0;
 257
 258         bytes_left = copy_from_user(lvb_buf, buf, count);
 259         count -= bytes_left;
 260         if (count)
 261                 user_dlm_write_lvb(inode, lvb_buf, count);
 262
 263         *ppos = *ppos + count;
 264         mlog(0, "wrote %zu bytes\n", count);
 265         return count;
 266 }
 267
 268 static void dlmfs_init_once(void *foo)
 269 {
 270         struct dlmfs_inode_private *ip =
 271                 (struct dlmfs_inode_private *) foo;
 272
 273         ip->ip_conn = NULL;
 274         ip->ip_parent = NULL;
 275
 276         inode_init_once(&ip->ip_vfs_inode);
 277 }
 278
 279 static struct inode *dlmfs_alloc_inode(struct super_block *sb)
 280 {
 281         struct dlmfs_inode_private *ip;
 282
 283         ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS);
 284         if (!ip)
 285                 return NULL;
 286
 287         return &ip->ip_vfs_inode;
 288 }
 289
 290 static void dlmfs_free_inode(struct inode *inode)
 291 {
 292         kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
 293 }
 294
 295 static void dlmfs_evict_inode(struct inode *inode)
 296 {
 297         int status;
 298         struct dlmfs_inode_private *ip;
 299         struct user_lock_res *lockres;
 300         int teardown;
 301
 302         clear_inode(inode);
 303
 304         mlog(0, "inode %lu\n", inode->i_ino);
 305
 306         ip = DLMFS_I(inode);
 307         lockres = &ip->ip_lockres;
 308
 309         if (S_ISREG(inode->i_mode)) {
 310                 spin_lock(&lockres->l_lock);
 311                 teardown = !!(lockres->l_flags & USER_LOCK_IN_TEARDOWN);
 312                 spin_unlock(&lockres->l_lock);
 313                 if (!teardown) {
 314                         status = user_dlm_destroy_lock(lockres);
 315                         if (status < 0)
 316                                 mlog_errno(status);
 317                 }
 318                 iput(ip->ip_parent);
 319                 goto clear_fields;
 320         }
 321
 322         mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
 323         /* we must be a directory. If required, lets unregister the
 324          * dlm context now. */
 325         if (ip->ip_conn)
 326                 user_dlm_unregister(ip->ip_conn);
 327 clear_fields:
 328         ip->ip_parent = NULL;
 329         ip->ip_conn = NULL;
 330 }
 331
 332 static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 333 {
 334         struct inode *inode = new_inode(sb);
 335         umode_t mode = S_IFDIR | 0755;
 336
 337         if (inode) {
 338                 inode->i_ino = get_next_ino();
 339                 inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 340                 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 341                 inc_nlink(inode);
 342
 343                 inode->i_fop = &simple_dir_operations;
 344                 inode->i_op = &dlmfs_root_inode_operations;
 345         }
 346
 347         return inode;
 348 }
 349
 350 static struct inode *dlmfs_get_inode(struct inode *parent,
 351                                      struct dentry *dentry,
 352                                      umode_t mode)
 353 {
 354         struct super_block *sb = parent->i_sb;
 355         struct inode * inode = new_inode(sb);
 356         struct dlmfs_inode_private *ip;
 357
 358         if (!inode)
 359                 return NULL;
 360
 361         inode->i_ino = get_next_ino();
 362         inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
 363         inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 364
 365         ip = DLMFS_I(inode);
 366         ip->ip_conn = DLMFS_I(parent)->ip_conn;
 367
 368         switch (mode & S_IFMT) {
 369         default:
 370                 /* for now we don't support anything other than
 371                  * directories and regular files. */
 372                 BUG();
 373                 break;
 374         case S_IFREG:
 375                 inode->i_op = &dlmfs_file_inode_operations;
 376                 inode->i_fop = &dlmfs_file_operations;
 377
 378                 i_size_write(inode,  DLM_LVB_LEN);
 379
 380                 user_dlm_lock_res_init(&ip->ip_lockres, dentry);
 381
 382                 /* released at clear_inode time, this insures that we
 383                  * get to drop the dlm reference on each lock *before*
 384                  * we call the unregister code for releasing parent
 385                  * directories. */
 386                 ip->ip_parent = igrab(parent);
 387                 BUG_ON(!ip->ip_parent);
 388                 break;
 389         case S_IFDIR:
 390                 inode->i_op = &dlmfs_dir_inode_operations;
 391                 inode->i_fop = &simple_dir_operations;
 392
 393                 /* directory inodes start off with i_nlink ==
 394                  * 2 (for "." entry) */
 395                 inc_nlink(inode);
 396                 break;
 397         }
 398         return inode;
 399 }
 400
 401 /*
 402  * File creation. Allocate an inode, and we're done..
 403  */
 404 /* SMP-safe */
 405 static int dlmfs_mkdir(struct mnt_idmap * idmap,
 406                        struct inode * dir,
 407                        struct dentry * dentry,
 408                        umode_t mode)
 409 {
 410         int status;
 411         struct inode *inode = NULL;
 412         const struct qstr *domain = &dentry->d_name;
 413         struct dlmfs_inode_private *ip;
 414         struct ocfs2_cluster_connection *conn;
 415
 416         mlog(0, "mkdir %.*s\n", domain->len, domain->name);
 417
 418         /* verify that we have a proper domain */
 419         if (domain->len >= GROUP_NAME_MAX) {
 420                 status = -EINVAL;
 421                 mlog(ML_ERROR, "invalid domain name for directory.\n");
 422                 goto bail;
 423         }
 424
 425         inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
 426         if (!inode) {
 427                 status = -ENOMEM;
 428                 mlog_errno(status);
 429                 goto bail;
 430         }
 431
 432         ip = DLMFS_I(inode);
 433
 434         conn = user_dlm_register(domain);
 435         if (IS_ERR(conn)) {
 436                 status = PTR_ERR(conn);
 437                 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
 438                      status, domain->len, domain->name);
 439                 goto bail;
 440         }
 441         ip->ip_conn = conn;
 442
 443         inc_nlink(dir);
 444         d_instantiate(dentry, inode);
 445         dget(dentry);   /* Extra count - pin the dentry in core */
 446
 447         status = 0;
 448 bail:
 449         if (status < 0)
 450                 iput(inode);
 451         return status;
 452 }
 453
 454 static int dlmfs_create(struct mnt_idmap *idmap,
 455                         struct inode *dir,
 456                         struct dentry *dentry,
 457                         umode_t mode,
 458                         bool excl)
 459 {
 460         int status = 0;
 461         struct inode *inode;
 462         const struct qstr *name = &dentry->d_name;
 463
 464         mlog(0, "create %.*s\n", name->len, name->name);
 465
 466         /* verify name is valid and doesn't contain any dlm reserved
 467          * characters */
 468         if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
 469             name->name[0] == '$') {
 470                 status = -EINVAL;
 471                 mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
 472                      name->name);
 473                 goto bail;
 474         }
 475
 476         inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
 477         if (!inode) {
 478                 status = -ENOMEM;
 479                 mlog_errno(status);
 480                 goto bail;
 481         }
 482
 483         d_instantiate(dentry, inode);
 484         dget(dentry);   /* Extra count - pin the dentry in core */
 485 bail:
 486         return status;
 487 }
 488
 489 static int dlmfs_unlink(struct inode *dir,
 490                         struct dentry *dentry)
 491 {
 492         int status;
 493         struct inode *inode = d_inode(dentry);
 494
 495         mlog(0, "unlink inode %lu\n", inode->i_ino);
 496
 497         /* if there are no current holders, or none that are waiting
 498          * to acquire a lock, this basically destroys our lockres. */
 499         status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
 500         if (status < 0) {
 501                 mlog(ML_ERROR, "unlink %pd, error %d from destroy\n",
 502                      dentry, status);
 503                 goto bail;
 504         }
 505         status = simple_unlink(dir, dentry);
 506 bail:
 507         return status;
 508 }
 509
 510 static int dlmfs_fill_super(struct super_block * sb,
 511                             void * data,
 512                             int silent)
 513 {
 514         sb->s_maxbytes = MAX_LFS_FILESIZE;
 515         sb->s_blocksize = PAGE_SIZE;
 516         sb->s_blocksize_bits = PAGE_SHIFT;
 517         sb->s_magic = DLMFS_MAGIC;
 518         sb->s_op = &dlmfs_ops;
 519         sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
 520         if (!sb->s_root)
 521                 return -ENOMEM;
 522         return 0;
 523 }
 524
 525 static const struct file_operations dlmfs_file_operations = {
 526         .open           = dlmfs_file_open,
 527         .release        = dlmfs_file_release,
 528         .poll           = dlmfs_file_poll,
 529         .read           = dlmfs_file_read,
 530         .write          = dlmfs_file_write,
 531         .llseek         = default_llseek,
 532 };
 533
 534 static const struct inode_operations dlmfs_dir_inode_operations = {
 535         .create         = dlmfs_create,
 536         .lookup         = simple_lookup,
 537         .unlink         = dlmfs_unlink,
 538 };
 539
 540 /* this way we can restrict mkdir to only the toplevel of the fs. */
 541 static const struct inode_operations dlmfs_root_inode_operations = {
 542         .lookup         = simple_lookup,
 543         .mkdir          = dlmfs_mkdir,
 544         .rmdir          = simple_rmdir,
 545 };
 546
 547 static const struct super_operations dlmfs_ops = {
 548         .statfs         = simple_statfs,
 549         .alloc_inode    = dlmfs_alloc_inode,
 550         .free_inode     = dlmfs_free_inode,
 551         .evict_inode    = dlmfs_evict_inode,
 552         .drop_inode     = generic_delete_inode,
 553 };
 554
 555 static const struct inode_operations dlmfs_file_inode_operations = {
 556         .getattr        = simple_getattr,
 557         .setattr        = dlmfs_file_setattr,
 558 };
 559
 560 static struct dentry *dlmfs_mount(struct file_system_type *fs_type,
 561         int flags, const char *dev_name, void *data)
 562 {
 563         return mount_nodev(fs_type, flags, data, dlmfs_fill_super);
 564 }
 565
 566 static struct file_system_type dlmfs_fs_type = {
 567         .owner          = THIS_MODULE,
 568         .name           = "ocfs2_dlmfs",
 569         .mount          = dlmfs_mount,
 570         .kill_sb        = kill_litter_super,
 571 };
 572 MODULE_ALIAS_FS("ocfs2_dlmfs");
 573
 574 static int __init init_dlmfs_fs(void)
 575 {
 576         int status;
 577         int cleanup_inode = 0, cleanup_worker = 0;
 578
 579         dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 580                                 sizeof(struct dlmfs_inode_private),
 581                                 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 582                                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 583                                 dlmfs_init_once);
 584         if (!dlmfs_inode_cache) {
 585                 status = -ENOMEM;
 586                 goto bail;
 587         }
 588         cleanup_inode = 1;
 589
 590         user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0);
 591         if (!user_dlm_worker) {
 592                 status = -ENOMEM;
 593                 goto bail;
 594         }
 595         cleanup_worker = 1;
 596
 597         user_dlm_set_locking_protocol();
 598         status = register_filesystem(&dlmfs_fs_type);
 599 bail:
 600         if (status) {
 601                 if (cleanup_inode)
 602                         kmem_cache_destroy(dlmfs_inode_cache);
 603                 if (cleanup_worker)
 604                         destroy_workqueue(user_dlm_worker);
 605         } else
 606                 printk("OCFS2 User DLM kernel interface loaded\n");
 607         return status;
 608 }
 609
 610 static void __exit exit_dlmfs_fs(void)
 611 {
 612         unregister_filesystem(&dlmfs_fs_type);
 613
 614         destroy_workqueue(user_dlm_worker);
 615
 616         /*
 617          * Make sure all delayed rcu free inodes are flushed before we
 618          * destroy cache.
 619          */
 620         rcu_barrier();
 621         kmem_cache_destroy(dlmfs_inode_cache);
 622
 623 }
 624
 625 MODULE_AUTHOR("Oracle");
 626 MODULE_LICENSE("GPL");
 627 MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
 628
 629 module_init(init_dlmfs_fs)
 630 module_exit(exit_dlmfs_fs)