copies. Currently, it's only used in copy_file_range, which will revert
to the default VFS implementation if this option is used.
+ recover_session=<no|clean>
+ Set auto reconnect mode in the case where the client is blacklisted. The
+ available modes are "no" and "clean". The default is "no".
+
+ * no: never attempt to reconnect when the client detects that it has been
+ blacklisted. Operations will generally fail after being blacklisted.
+
+ * clean: the client reconnects to the ceph cluster automatically when it
+ detects that it has been blacklisted. During reconnect, the client drops
+ dirty data/metadata and invalidates page caches and writable file handles.
+ After reconnect, file locks become stale because the MDS loses track
+ of them. If an inode contains any stale file locks, read/write on the
+ inode is not allowed until applications release all stale file locks.
+
More Information
================
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc =
- &ceph_inode_to_client(inode)->client->osdc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
- err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
- off, &len,
+ err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
+ &ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
if (err == -ENOENT)
if (err < 0) {
SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
+ if (err == -EBLACKLISTED)
+ fsc->blacklisted = true;
goto out;
}
if (err < PAGE_SIZE)
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
+ if (rc == -EBLACKLISTED)
+ ceph_inode_to_client(inode)->blacklisted = true;
/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
end_page_writeback(page);
return err;
}
+ if (err == -EBLACKLISTED)
+ fsc->blacklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
SetPageError(page);
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
+ if (rc == -EBLACKLISTED)
+ fsc->blacklisted = true;
} else {
ceph_clear_error_write(ci);
}
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
- else if (err != -EPERM)
+ else if (err != -EPERM) {
+ if (err == -EBLACKLISTED)
+ fsc->blacklisted = true;
goto out_unlock;
+ }
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
+ if (err2 == -EBLACKLISTED)
+ fsc->blacklisted = true;
err = err2;
goto out_unlock;
}
ceph_release_page_vector(pages, num_pages);
}
- if (ret <= 0 || off >= i_size || !more)
+ if (ret < 0) {
+ if (ret == -EBLACKLISTED)
+ fsc->blacklisted = true;
+ break;
+ }
+
+ if (off >= i_size || !more)
break;
}
pr_err("mdsc_handle_forward decode error err=%d\n", err);
}
-static int __decode_and_drop_session_metadata(void **p, void *end)
+static int __decode_session_metadata(void **p, void *end,
+ bool *blacklisted)
{
/* map<string,string> */
u32 n;
+ bool err_str;
ceph_decode_32_safe(p, end, n, bad);
while (n-- > 0) {
u32 len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
+ err_str = !strncmp(*p, "error_string", len);
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
+ if (err_str && strnstr(*p, "blacklisted", len))
+ *blacklisted = true;
*p += len;
}
return 0;
u64 seq;
unsigned long features = 0;
int wake = 0;
+ bool blacklisted = false;
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
if (msg_version >= 3) {
u32 len;
/* version >= 2, metadata */
- if (__decode_and_drop_session_metadata(&p, end) < 0)
+ if (__decode_session_metadata(&p, end, &blacklisted) < 0)
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
+ if (blacklisted)
+ mdsc->fsc->blacklisted = true;
wake = 2; /* for good measure */
break;
mutex_unlock(&mdsc->mutex);
}
+static void maybe_recover_session(struct ceph_mds_client *mdsc)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ /* Force a reconnect of a blacklisted client, if the mount opted in. */
+ if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) /* CLEANRECOVER not set */
+ return;
+ /* Act only on a mounted client that has been flagged as blacklisted. */
+ if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
+ return;
+ if (!READ_ONCE(fsc->blacklisted))
+ return;
+ /* Rate limit: a zero stamp skips this check; else wait HZ * 60 * 30. */
+ if (fsc->last_auto_reconnect &&
+ time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
+ return;
+ /* Stamp the attempt before calling ceph_force_reconnect(). */
+ pr_info("auto reconnect after blacklisted\n");
+ fsc->last_auto_reconnect = jiffies;
+ ceph_force_reconnect(fsc->sb);
+}
/*
* delayed work -- periodically trim expired leases, renew caps with mds
ceph_trim_snapid_map(mdsc);
+ maybe_recover_session(mdsc);
+
schedule_delayed(mdsc);
}
Opt_snapdirname,
Opt_mds_namespace,
Opt_fscache_uniq,
+ Opt_recover_session,
Opt_last_string,
/* string args above */
Opt_dirstat,
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
{Opt_mds_namespace, "mds_namespace=%s"},
+ {Opt_recover_session, "recover_session=%s"},
{Opt_fscache_uniq, "fsc=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
if (!fsopt->mds_namespace)
return -ENOMEM;
break;
+ case Opt_recover_session:
+ if (!strncmp(argstr[0].from, "no",
+ argstr[0].to - argstr[0].from)) {
+ fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
+ } else if (!strncmp(argstr[0].from, "clean",
+ argstr[0].to - argstr[0].from)) {
+ fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
+ } else {
+ return -EINVAL;
+ }
+ break;
case Opt_fscache_uniq:
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
+ seq_show_option(m, "recover_session", "clean");
+
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
ceph_reset_client_addr(fsc->client);
ceph_osdc_clear_abort_err(&fsc->client->osdc);
+
+ fsc->blacklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;
if (sb->s_root) {
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
unsigned long mount_state;
+ unsigned long last_auto_reconnect;
+ bool blacklisted;
+
u32 filp_gen;
loff_t max_file_size;