ceph: auto reconnect after blacklisted

author Yan, Zheng <zyan@redhat.com>

Thu, 25 Jul 2019 12:16:47 +0000 (20:16 +0800)

committer Ilya Dryomov <idryomov@gmail.com>

Mon, 16 Sep 2019 10:06:24 +0000 (12:06 +0200)
author Yan, Zheng <zyan@redhat.com>
Thu, 25 Jul 2019 12:16:47 +0000 (20:16 +0800)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 16 Sep 2019 10:06:24 +0000 (12:06 +0200)
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt

index d2c6a5c..b19b6a0 100644 (file)
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -158,6 +158,20 @@ Mount Options
          copies.  Currently, it's only used in copy_file_range, which will revert
          to the default VFS implementation if this option is used.
  
+  recover_session=<no|clean>
+       Set auto reconnect mode in the case where the client is blacklisted. The
+       available modes are "no" and "clean". The default is "no".
+
+       * no: never attempt to reconnect when client detects that it has been
+       blacklisted. Operations will generally fail after being blacklisted.
+
+       * clean: client reconnects to the ceph cluster automatically when it
+       detects that it has been blacklisted. During reconnect, client drops
+       dirty data/metadata, invalidates page caches and writable file handles.
+       After reconnect, file locks become stale because the MDS loses track
+       of them. If an inode contains any stale file locks, read/write on the
+       inode is not allowed until applications release all stale file locks.
+
  More Information
  ================
  
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

index 2d6e23e..6260228 100644 (file)
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
  {
         struct inode *inode = file_inode(filp);
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc =
-               &ceph_inode_to_client(inode)->client->osdc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         int err = 0;
         u64 off = page_offset(page);
         u64 len = PAGE_SIZE;
@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
  
         dout("readpage inode %p file %p page %p index %lu\n",
              inode, filp, page, page->index);
-       err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
-                                 off, &len,
+       err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+                                 &ci->i_layout, off, &len,
                                   ci->i_truncate_seq, ci->i_truncate_size,
                                   &page, 1, 0);
         if (err == -ENOENT)
@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
         if (err < 0) {
                 SetPageError(page);
                 ceph_fscache_readpage_cancel(inode, page);
+               if (err == -EBLACKLISTED)
+                       fsc->blacklisted = true;
                 goto out;
         }
         if (err < PAGE_SIZE)
@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
         int i;
  
         dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+       if (rc == -EBLACKLISTED)
+               ceph_inode_to_client(inode)->blacklisted = true;
  
         /* unlock all pages, zeroing any data we didn't read */
         osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -641,6 +644,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                         end_page_writeback(page);
                         return err;
                 }
+               if (err == -EBLACKLISTED)
+                       fsc->blacklisted = true;
                 dout("writepage setting page/mapping error %d %p\n",
                      err, page);
                 SetPageError(page);
@@ -721,6 +726,8 @@ static void writepages_finish(struct ceph_osd_request *req)
         if (rc < 0) {
                 mapping_set_error(mapping, rc);
                 ceph_set_error_write(ci);
+               if (rc == -EBLACKLISTED)
+                       fsc->blacklisted = true;
         } else {
                 ceph_clear_error_write(ci);
         }
@@ -1948,12 +1955,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
  
         if (err >= 0 || err == -ENOENT)
                 have |= POOL_READ;
-       else if (err != -EPERM)
+       else if (err != -EPERM) {
+               if (err == -EBLACKLISTED)
+                       fsc->blacklisted = true;
                 goto out_unlock;
+       }
  
         if (err2 == 0 || err2 == -EEXIST)
                 have |= POOL_WRITE;
         else if (err2 != -EPERM) {
+               if (err2 == -EBLACKLISTED)
+                       fsc->blacklisted = true;
                 err = err2;
                 goto out_unlock;
         }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c

index 779bf68..5182e1a 100644 (file)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -698,7 +698,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
                         ceph_release_page_vector(pages, num_pages);
                 }
  
-               if (ret <= 0 || off >= i_size || !more)
+               if (ret < 0) {
+                       if (ret == -EBLACKLISTED)
+                               fsc->blacklisted = true;
+                       break;
+               }
+
+               if (off >= i_size || !more)
                         break;
         }
  
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

index ed4d20a..5bfbff8 100644 (file)
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3032,18 +3032,23 @@ bad:
         pr_err("mdsc_handle_forward decode error err=%d\n", err);
  }
  
-static int __decode_and_drop_session_metadata(void **p, void *end)
+static int __decode_session_metadata(void **p, void *end,
+                                    bool *blacklisted)
  {
         /* map<string,string> */
         u32 n;
+       bool err_str;
         ceph_decode_32_safe(p, end, n, bad);
         while (n-- > 0) {
                 u32 len;
                 ceph_decode_32_safe(p, end, len, bad);
                 ceph_decode_need(p, end, len, bad);
+               err_str = !strncmp(*p, "error_string", len);
                 *p += len;
                 ceph_decode_32_safe(p, end, len, bad);
                 ceph_decode_need(p, end, len, bad);
+               if (err_str && strnstr(*p, "blacklisted", len))
+                       *blacklisted = true;
                 *p += len;
         }
         return 0;
@@ -3067,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session,
         u64 seq;
         unsigned long features = 0;
         int wake = 0;
+       bool blacklisted = false;
  
         /* decode */
         ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3079,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session,
         if (msg_version >= 3) {
                 u32 len;
                 /* version >= 2, metadata */
-               if (__decode_and_drop_session_metadata(&p, end) < 0)
+               if (__decode_session_metadata(&p, end, &blacklisted) < 0)
                         goto bad;
                 /* version >= 3, feature bits */
                 ceph_decode_32_safe(&p, end, len, bad);
@@ -3166,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session,
                 session->s_state = CEPH_MDS_SESSION_REJECTED;
                 cleanup_session_requests(mdsc, session);
                 remove_session_caps(session);
+               if (blacklisted)
+                       mdsc->fsc->blacklisted = true;
                 wake = 2; /* for good measure */
                 break;
  
@@ -4015,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
         mutex_unlock(&mdsc->mutex);
  }
  
+static void maybe_recover_session(struct ceph_mds_client *mdsc)
+{
+       struct ceph_fs_client *fsc = mdsc->fsc;
+
+       if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
+               return;
+
+       if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
+               return;
  
+       if (!READ_ONCE(fsc->blacklisted))
+               return;
+
+       if (fsc->last_auto_reconnect &&
+           time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
+               return;
+
+       pr_info("auto reconnect after blacklisted\n");
+       fsc->last_auto_reconnect = jiffies;
+       ceph_force_reconnect(fsc->sb);
+}
  
  /*
   * delayed work -- periodically trim expired leases, renew caps with mds
@@ -4089,6 +4117,8 @@ static void delayed_work(struct work_struct *work)
  
         ceph_trim_snapid_map(mdsc);
  
+       maybe_recover_session(mdsc);
+
         schedule_delayed(mdsc);
  }
  
diff --git a/fs/ceph/super.c b/fs/ceph/super.c

index 630549a..03b63b1 100644 (file)
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -143,6 +143,7 @@ enum {
         Opt_snapdirname,
         Opt_mds_namespace,
         Opt_fscache_uniq,
+       Opt_recover_session,
         Opt_last_string,
         /* string args above */
         Opt_dirstat,
@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = {
         /* int args above */
         {Opt_snapdirname, "snapdirname=%s"},
         {Opt_mds_namespace, "mds_namespace=%s"},
+       {Opt_recover_session, "recover_session=%s"},
         {Opt_fscache_uniq, "fsc=%s"},
         /* string args above */
         {Opt_dirstat, "dirstat"},
@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private)
                 if (!fsopt->mds_namespace)
                         return -ENOMEM;
                 break;
+       case Opt_recover_session:
+               if (!strncmp(argstr[0].from, "no",
+                            argstr[0].to - argstr[0].from)) {
+                       fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
+               } else if (!strncmp(argstr[0].from, "clean",
+                                   argstr[0].to - argstr[0].from)) {
+                       fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
+               } else {
+                       return -EINVAL;
+               }
+               break;
         case Opt_fscache_uniq:
                 kfree(fsopt->fscache_uniq);
                 fsopt->fscache_uniq = kstrndup(argstr[0].from,
@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
  
         if (fsopt->mds_namespace)
                 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+
+       if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
+               seq_show_option(m, "recover_session", "clean");
+
         if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
                 seq_printf(m, ",wsize=%d", fsopt->wsize);
         if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -1169,6 +1186,8 @@ int ceph_force_reconnect(struct super_block *sb)
         ceph_reset_client_addr(fsc->client);
  
         ceph_osdc_clear_abort_err(&fsc->client->osdc);
+
+       fsc->blacklisted = false;
         fsc->mount_state = CEPH_MOUNT_MOUNTED;
  
         if (sb->s_root) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h

index f5e5f6a..2105c2c 100644 (file)
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -31,6 +31,7 @@
  #define CEPH_BLOCK_SHIFT   22  /* 4 MB */
  #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
  
+#define CEPH_MOUNT_OPT_CLEANRECOVER    (1<<1) /* auto reconnect (clean mode) after blacklisted */
  #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
  #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
  #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
@@ -102,6 +103,9 @@ struct ceph_fs_client {
  
         unsigned long mount_state;
  
+       unsigned long last_auto_reconnect;
+       bool blacklisted;
+
         u32 filp_gen;
         loff_t max_file_size;
author	Yan, Zheng <zyan@redhat.com>
	Thu, 25 Jul 2019 12:16:47 +0000 (20:16 +0800)
committer	Ilya Dryomov <idryomov@gmail.com>
	Mon, 16 Sep 2019 10:06:24 +0000 (12:06 +0200)
Documentation/filesystems/ceph.txt		patch \| blob \| history
fs/ceph/addr.c		patch \| blob \| history
fs/ceph/file.c		patch \| blob \| history
fs/ceph/mds_client.c		patch \| blob \| history
fs/ceph/super.c		patch \| blob \| history
fs/ceph/super.h		patch \| blob \| history