Merge tag 'ceph-for-5.5-rc2' of git://github.com/ceph/ceph-client
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Dec 2019 18:56:37 +0000 (10:56 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Dec 2019 18:56:37 +0000 (10:56 -0800)
Pull ceph fixes from Ilya Dryomov:
 "A fix to avoid a corner case when scheduling cap reclaim in batches
  from Xiubo, a patch to add some observability into cap waiters from
  Jeff and a couple of cleanups"

* tag 'ceph-for-5.5-rc2' of git://github.com/ceph/ceph-client:
  ceph: add more debug info when decoding mdsmap
  ceph: switch to global cap helper
  ceph: trigger the reclaim work once there has enough pending caps
  ceph: show tasks waiting on caps in debugfs caps file
  ceph: convert int fields in ceph_mount_options to unsigned int

fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/mdsmap.c
fs/ceph/super.c
fs/ceph/super.h

index f5a3891..9d09bb5 100644 (file)
@@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci)
        return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
 }
 
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
-{
-       return !RB_EMPTY_ROOT(&ci->i_caps);
-}
-
 int ceph_is_any_caps(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int ret;
 
        spin_lock(&ci->i_ceph_lock);
-       ret = __ceph_is_any_caps(ci);
+       ret = __ceph_is_any_real_caps(ci);
        spin_unlock(&ci->i_ceph_lock);
 
        return ret;
@@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
        if (removed)
                ceph_put_cap(mdsc, cap);
 
-       /* when reconnect denied, we remove session caps forcibly,
-        * i_wr_ref can be non-zero. If there are ongoing write,
-        * keep i_snap_realm.
-        */
-       if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
-               drop_inode_snap_realm(ci);
+       if (!__ceph_is_any_real_caps(ci)) {
+               /* when reconnect denied, we remove session caps forcibly,
+                * i_wr_ref can be non-zero. If there are ongoing write,
+                * keep i_snap_realm.
+                */
+               if (ci->i_wr_ref == 0 && ci->i_snap_realm)
+                       drop_inode_snap_realm(ci);
 
-       if (!__ceph_is_any_real_caps(ci))
                __cap_delay_cancel(mdsc, ci);
+       }
 }
 
 struct cap_msg_args {
@@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want,
                if (ret == -EAGAIN)
                        continue;
                if (!ret) {
+                       struct ceph_mds_client *mdsc = fsc->mdsc;
+                       struct cap_wait cw;
                        DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+                       cw.ino = inode->i_ino;
+                       cw.tgid = current->tgid;
+                       cw.need = need;
+                       cw.want = want;
+
+                       spin_lock(&mdsc->caps_list_lock);
+                       list_add(&cw.list, &mdsc->cap_wait_list);
+                       spin_unlock(&mdsc->caps_list_lock);
+
                        add_wait_queue(&ci->i_cap_wq, &wait);
 
                        flags |= NON_BLOCKING;
@@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
                        }
 
                        remove_wait_queue(&ci->i_cap_wq, &wait);
+
+                       spin_lock(&mdsc->caps_list_lock);
+                       list_del(&cw.list);
+                       spin_unlock(&mdsc->caps_list_lock);
+
                        if (ret == -EAGAIN)
                                continue;
                }
@@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
                                ci->i_head_snapc = NULL;
                        }
                        /* see comment in __ceph_remove_cap() */
-                       if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
+                       if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
                                drop_inode_snap_realm(ci);
                }
        spin_unlock(&ci->i_ceph_lock);
index facb387..c281f32 100644 (file)
@@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p)
        struct ceph_fs_client *fsc = s->private;
        struct ceph_mds_client *mdsc = fsc->mdsc;
        int total, avail, used, reserved, min, i;
+       struct cap_wait *cw;
 
        ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
        seq_printf(s, "total\t\t%d\n"
@@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p)
        }
        mutex_unlock(&mdsc->mutex);
 
+       seq_printf(s, "\n\nWaiters:\n--------\n");
+       seq_printf(s, "tgid         ino                need             want\n");
+       seq_printf(s, "-----------------------------------------------------\n");
+
+       spin_lock(&mdsc->caps_list_lock);
+       list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
+               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+                               ceph_cap_string(cw->need),
+                               ceph_cap_string(cw->want));
+       }
+       spin_unlock(&mdsc->caps_list_lock);
+
        return 0;
 }
 
index 068b029..374db1b 100644 (file)
@@ -2015,7 +2015,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
        if (!nr)
                return;
        val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
-       if (!(val % CEPH_CAPS_PER_RELEASE)) {
+       if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
                atomic_set(&mdsc->cap_reclaim_pending, 0);
                ceph_queue_cap_reclaim_work(mdsc);
        }
@@ -2032,12 +2032,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
        struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
        struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
        size_t size = sizeof(struct ceph_mds_reply_dir_entry);
-       int order, num_entries;
+       unsigned int num_entries;
+       int order;
 
        spin_lock(&ci->i_ceph_lock);
        num_entries = ci->i_files + ci->i_subdirs;
        spin_unlock(&ci->i_ceph_lock);
-       num_entries = max(num_entries, 1);
+       num_entries = max(num_entries, 1U);
        num_entries = min(num_entries, opt->max_readdir);
 
        order = get_order(size * num_entries);
@@ -4168,6 +4169,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
        mdsc->last_renew_caps = jiffies;
        INIT_LIST_HEAD(&mdsc->cap_delay_list);
+       INIT_LIST_HEAD(&mdsc->cap_wait_list);
        spin_lock_init(&mdsc->cap_delay_lock);
        INIT_LIST_HEAD(&mdsc->snap_flush_list);
        spin_lock_init(&mdsc->snap_flush_lock);
index 5cd131b..14c7e8c 100644 (file)
@@ -340,6 +340,14 @@ struct ceph_quotarealm_inode {
        struct inode *inode;
 };
 
+struct cap_wait {
+       struct list_head        list;
+       unsigned long           ino;
+       pid_t                   tgid;
+       int                     need;
+       int                     want;
+};
+
 /*
  * mds client state
  */
@@ -416,6 +424,7 @@ struct ceph_mds_client {
        spinlock_t      caps_list_lock;
        struct          list_head caps_list; /* unused (reserved or
                                                unreserved) */
+       struct          list_head cap_wait_list;
        int             caps_total_count;    /* total caps allocated */
        int             caps_use_count;      /* in use */
        int             caps_use_max;        /* max used caps */
index aeec1d6..471bac3 100644 (file)
@@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                void *pexport_targets = NULL;
                struct ceph_timespec laggy_since;
                struct ceph_mds_info *info;
+               bool laggy;
 
                ceph_decode_need(p, end, sizeof(u64) + 1, bad);
                global_id = ceph_decode_64(p);
@@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                if (err)
                        goto corrupt;
                ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+               laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
                *p += sizeof(u32);
                ceph_decode_32_safe(p, end, namelen, bad);
                *p += namelen;
@@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                        *p = info_end;
                }
 
-               dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+               dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
                     i+1, n, global_id, mds, inc,
                     ceph_pr_addr(&addr),
-                    ceph_mds_state_name(state));
+                    ceph_mds_state_name(state),
+                    laggy ? "(laggy)" : "");
 
                if (mds < 0 || state <= 0)
                        continue;
@@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                info->global_id = global_id;
                info->state = state;
                info->addr = addr;
-               info->laggy = (laggy_since.tv_sec != 0 ||
-                              laggy_since.tv_nsec != 0);
+               info->laggy = laggy;
                info->num_export_targets = num_export_targets;
                if (num_export_targets) {
                        info->export_targets = kcalloc(num_export_targets,
@@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                m->m_damaged = false;
        }
 bad_ext:
+       dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+            !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
        *p = end;
        dout("mdsmap_decode success epoch %u\n", m->m_epoch);
        return m;
index 9c9a7c6..29a795f 100644 (file)
@@ -172,10 +172,10 @@ static const struct fs_parameter_enum ceph_mount_param_enums[] = {
 static const struct fs_parameter_spec ceph_mount_param_specs[] = {
        fsparam_flag_no ("acl",                         Opt_acl),
        fsparam_flag_no ("asyncreaddir",                Opt_asyncreaddir),
-       fsparam_u32     ("caps_max",                    Opt_caps_max),
+       fsparam_s32     ("caps_max",                    Opt_caps_max),
        fsparam_u32     ("caps_wanted_delay_max",       Opt_caps_wanted_delay_max),
        fsparam_u32     ("caps_wanted_delay_min",       Opt_caps_wanted_delay_min),
-       fsparam_s32     ("write_congestion_kb",         Opt_congestion_kb),
+       fsparam_u32     ("write_congestion_kb",         Opt_congestion_kb),
        fsparam_flag_no ("copyfrom",                    Opt_copyfrom),
        fsparam_flag_no ("dcache",                      Opt_dcache),
        fsparam_flag_no ("dirstat",                     Opt_dirstat),
@@ -187,8 +187,8 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = {
        fsparam_flag_no ("quotadf",                     Opt_quotadf),
        fsparam_u32     ("rasize",                      Opt_rasize),
        fsparam_flag_no ("rbytes",                      Opt_rbytes),
-       fsparam_s32     ("readdir_max_bytes",           Opt_readdir_max_bytes),
-       fsparam_s32     ("readdir_max_entries",         Opt_readdir_max_entries),
+       fsparam_u32     ("readdir_max_bytes",           Opt_readdir_max_bytes),
+       fsparam_u32     ("readdir_max_entries",         Opt_readdir_max_entries),
        fsparam_enum    ("recover_session",             Opt_recover_session),
        fsparam_flag_no ("require_active_mds",          Opt_require_active_mds),
        fsparam_u32     ("rsize",                       Opt_rsize),
@@ -328,7 +328,9 @@ static int ceph_parse_mount_param(struct fs_context *fc,
                fsopt->caps_wanted_delay_max = result.uint_32;
                break;
        case Opt_caps_max:
-               fsopt->caps_max = result.uint_32;
+               if (result.int_32 < 0)
+                       goto out_of_range;
+               fsopt->caps_max = result.int_32;
                break;
        case Opt_readdir_max_entries:
                if (result.uint_32 < 1)
@@ -547,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_show_option(m, "recover_session", "clean");
 
        if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
-               seq_printf(m, ",wsize=%d", fsopt->wsize);
+               seq_printf(m, ",wsize=%u", fsopt->wsize);
        if (fsopt->rsize != CEPH_MAX_READ_SIZE)
-               seq_printf(m, ",rsize=%d", fsopt->rsize);
+               seq_printf(m, ",rsize=%u", fsopt->rsize);
        if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
-               seq_printf(m, ",rasize=%d", fsopt->rasize);
+               seq_printf(m, ",rasize=%u", fsopt->rasize);
        if (fsopt->congestion_kb != default_congestion_kb())
-               seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+               seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb);
        if (fsopt->caps_max)
                seq_printf(m, ",caps_max=%d", fsopt->caps_max);
        if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_min=%d",
+               seq_printf(m, ",caps_wanted_delay_min=%u",
                         fsopt->caps_wanted_delay_min);
        if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_max=%d",
+               seq_printf(m, ",caps_wanted_delay_max=%u",
                           fsopt->caps_wanted_delay_max);
        if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-               seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+               seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir);
        if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-               seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+               seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes);
        if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
                seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 
index f0f9cb7..3bf1a01 100644 (file)
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
 
 struct ceph_mount_options {
-       int flags;
+       unsigned int flags;
 
-       int wsize;            /* max write size */
-       int rsize;            /* max read size */
-       int rasize;           /* max readahead */
-       int congestion_kb;    /* max writeback in flight */
-       int caps_wanted_delay_min, caps_wanted_delay_max;
+       unsigned int wsize;            /* max write size */
+       unsigned int rsize;            /* max read size */
+       unsigned int rasize;           /* max readahead */
+       unsigned int congestion_kb;    /* max writeback in flight */
+       unsigned int caps_wanted_delay_min, caps_wanted_delay_max;
        int caps_max;
-       int max_readdir;       /* max readdir result (entires) */
-       int max_readdir_bytes; /* max readdir result (bytes) */
+       unsigned int max_readdir;       /* max readdir result (entries) */
+       unsigned int max_readdir_bytes; /* max readdir result (bytes) */
 
        /*
         * everything above this point can be memcmp'd; everything below