libceph, ceph: get and handle cluster maps with addrvecs
author Ilya Dryomov <idryomov@gmail.com>
Fri, 30 Oct 2020 12:30:51 +0000 (13:30 +0100)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 14 Dec 2020 22:21:50 +0000 (23:21 +0100)
In preparation for msgr2, make the cluster send us maps with addrvecs
including both LEGACY and MSGR2 addrs instead of a single LEGACY addr.
This means advertising support for SERVER_NAUTILUS and also some older
features: SERVER_MIMIC, MONENC and MONNAMES.

MONNAMES and MONENC are actually pre-argonaut, we just never updated
ceph_monmap_decode() for them.  Decoding is unconditional, see commit
23c625ce3065 ("libceph: assume argonaut on the server side").

SERVER_MIMIC doesn't bear any meaning for the kernel client.

Since ceph_decode_entity_addrvec() is guarded by encoding version
checks (and in msgr2 case it is guarded implicitly by the fact that
server is speaking msgr2), we assume MSG_ADDR2 for it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/mds_client.c
fs/ceph/mdsmap.c
include/linux/ceph/ceph_features.h
include/linux/ceph/decode.h
include/linux/ceph/mdsmap.h
include/linux/ceph/osdmap.h
net/ceph/decode.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c

index 278fe67..afd2281 100644 (file)
@@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
                return;
        }
 
-       newmap = ceph_mdsmap_decode(&p, end);
+       newmap = ceph_mdsmap_decode(&p, end, false);
        if (IS_ERR(newmap)) {
                err = PTR_ERR(newmap);
                goto bad_unlock;
index 1096d1d..abd9af7 100644 (file)
@@ -114,7 +114,7 @@ bad:
  * Ignore any fields we don't care about (there are quite a few of
  * them).
  */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 {
        struct ceph_mdsmap *m;
        const void *start = *p;
@@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                namelen = ceph_decode_32(p);  /* skip mds name */
                *p += namelen;
 
-               ceph_decode_need(p, end,
-                                4*sizeof(u32) + sizeof(u64) +
-                                sizeof(addr) + sizeof(struct ceph_timespec),
-                                bad);
-               mds = ceph_decode_32(p);
-               inc = ceph_decode_32(p);
-               state = ceph_decode_32(p);
+               ceph_decode_32_safe(p, end, mds, bad);
+               ceph_decode_32_safe(p, end, inc, bad);
+               ceph_decode_32_safe(p, end, state, bad);
                *p += sizeof(u64);              /* state_seq */
-               err = ceph_decode_entity_addr(p, end, &addr);
+               if (info_v >= 8)
+                       err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+               else
+                       err = ceph_decode_entity_addr(p, end, &addr);
                if (err)
                        goto corrupt;
-               ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+
+               ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
+                                     bad);
                laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
                *p += sizeof(u32);
                ceph_decode_32_safe(p, end, namelen, bad);
index 999636d..3a47acd 100644 (file)
@@ -8,7 +8,8 @@
  * feature.  Base case is 1 (first use).
  */
 #define CEPH_FEATURE_INCARNATION_1 (0ull)
-#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57)              // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
 
 #define DEFINE_CEPH_FEATURE(bit, incarnation, name)                    \
        static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit);         \
@@ -75,7 +76,7 @@
 DEFINE_CEPH_FEATURE( 0, 1, UID)
 DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
 DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
-
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
 DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
 DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
 DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
@@ -114,7 +115,7 @@ DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
 DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
 DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
 DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
-DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
 DEFINE_CEPH_FEATURE(29, 1, MDSENC)
 DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
 DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS)  // deprecate me
@@ -177,13 +178,16 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT                \
        (CEPH_FEATURE_NOSRCADDR |               \
+        CEPH_FEATURE_SERVER_NAUTILUS |         \
         CEPH_FEATURE_FLOCK |                   \
         CEPH_FEATURE_SUBSCRIBE2 |              \
+        CEPH_FEATURE_MONNAMES |                \
         CEPH_FEATURE_RECONNECT_SEQ |           \
         CEPH_FEATURE_DIRLAYOUTHASH |           \
         CEPH_FEATURE_PGID64 |                  \
         CEPH_FEATURE_PGPOOL3 |                 \
         CEPH_FEATURE_OSDENC |                  \
+        CEPH_FEATURE_MONENC |                  \
         CEPH_FEATURE_CRUSH_TUNABLES |          \
         CEPH_FEATURE_SERVER_LUMINOUS |         \
         CEPH_FEATURE_RESEND_ON_SPLIT |         \
@@ -193,6 +197,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
         CEPH_FEATURE_MSG_AUTH |                \
         CEPH_FEATURE_CRUSH_TUNABLES2 |         \
         CEPH_FEATURE_REPLY_CREATE_INODE |      \
+        CEPH_FEATURE_SERVER_MIMIC |            \
         CEPH_FEATURE_MDSENC |                  \
         CEPH_FEATURE_OSDHASHPSPOOL |           \
         CEPH_FEATURE_OSD_CACHEPOOL |           \
index 450384f..9a934e0 100644 (file)
@@ -220,6 +220,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
  */
 #define CEPH_ENTITY_ADDR_TYPE_NONE     0
 #define CEPH_ENTITY_ADDR_TYPE_LEGACY   __cpu_to_le32(1)
+#define CEPH_ENTITY_ADDR_TYPE_MSGR2    __cpu_to_le32(2)
 
 static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
 {
@@ -239,6 +240,9 @@ static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
 
 extern int ceph_decode_entity_addr(void **p, void *end,
                                   struct ceph_entity_addr *addr);
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+                              struct ceph_entity_addr *addr);
+
 /*
  * encoders
  */
index 35d3852..523fd04 100644 (file)
@@ -64,7 +64,7 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
 }
 
 extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
 extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
 extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
 
index cad9acf..5553019 100644 (file)
@@ -251,8 +251,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 }
 
 struct ceph_osdmap *ceph_osdmap_alloc(void);
-extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
                                             struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
index eea5295..6429b67 100644 (file)
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/ceph/decode.h>
 
@@ -82,3 +83,58 @@ bad:
 }
 EXPORT_SYMBOL(ceph_decode_entity_addr);
 
+/*
+ * Pick the addr of desired type (MSGR2 if @msgr2, else LEGACY) out of an
+ * addrvec.  0 on exactly one match or an empty addrvec (@addr untouched),
+ * -EINVAL if the marker isn't 2, the marker/count can't be read or there
+ * is a second match, -ENOENT on no match.  Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+                              struct ceph_entity_addr *addr)
+{
+       __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+                                CEPH_ENTITY_ADDR_TYPE_LEGACY;
+       struct ceph_entity_addr tmp_addr;
+       int addr_cnt;
+       bool found;
+       u8 marker;
+       int ret;
+       int i;
+
+       ceph_decode_8_safe(p, end, marker, e_inval);
+       if (marker != 2) {
+               pr_err("bad addrvec marker %d\n", marker);
+               return -EINVAL;
+       }
+
+       ceph_decode_32_safe(p, end, addr_cnt, e_inval);  /* number of addrs that follow */
+
+       found = false;
+       for (i = 0; i < addr_cnt; i++) {
+               ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+               if (ret)
+                       return ret;  /* pass the decode error through */
+
+               if (tmp_addr.type == my_type) {  /* my_type is __le32; no byte swap here */
+                       if (found) {
+                               pr_err("another match of type %d in addrvec\n",
+                                      le32_to_cpu(my_type));
+                               return -EINVAL;
+                       }
+
+                       memcpy(addr, &tmp_addr, sizeof(*addr));
+                       found = true;
+               }
+       }
+       if (!found && addr_cnt != 0) {  /* an empty addrvec is accepted as is */
+               pr_err("no match of type %d in addrvec\n",
+                      le32_to_cpu(my_type));
+               return -ENOENT;
+       }
+
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
index ebfecf8..a9754a7 100644 (file)
@@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
 
 static int __validate_auth(struct ceph_mon_client *monc);
 
+static int decode_mon_info(void **p, void *end, bool msgr2,
+                          struct ceph_entity_addr *addr)
+{      /* decode one mon_info_t, keeping only the addr of the wanted type */
+       void *mon_info_end;
+       u32 struct_len;
+       u8 struct_v;
+       int ret;
+
+       ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+                                 &struct_len);
+       if (ret)
+               return ret;
+
+       mon_info_end = *p + struct_len;  /* struct_len as set by ceph_start_decoding() */
+       ceph_decode_skip_string(p, end, e_inval);  /* skip mon name */
+       ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+       if (ret)
+               return ret;
+
+       *p = mon_info_end;  /* jump to the end, skipping any fields not decoded above */
+       return 0;
+
+e_inval:
+       return -EINVAL;
+}
+
 /*
  * Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
  */
-static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
 {
-       struct ceph_monmap *m = NULL;
-       int i, err = -EINVAL;
+       struct ceph_monmap *monmap = NULL;
        struct ceph_fsid fsid;
-       u32 epoch, num_mon;
-       u32 len;
+       u32 struct_len;
+       int blob_len;
+       int num_mon;
+       u8 struct_v;
+       u32 epoch;
+       int ret;
+       int i;
+
+       ceph_decode_32_safe(p, end, blob_len, e_inval);
+       ceph_decode_need(p, end, blob_len, e_inval);
+
+       ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+       if (ret)
+               goto fail;
+
+       dout("%s struct_v %d\n", __func__, struct_v);
+       ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+       ceph_decode_32_safe(p, end, epoch, e_inval);
+       if (struct_v >= 6) {
+               u32 feat_struct_len;
+               u8 feat_struct_v;
 
-       ceph_decode_32_safe(&p, end, len, bad);
-       ceph_decode_need(&p, end, len, bad);
+               *p += sizeof(struct ceph_timespec);  /* skip last_changed */
+               *p += sizeof(struct ceph_timespec);  /* skip created */
 
-       dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
-       p += sizeof(u16);  /* skip version */
+               ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+                                         &feat_struct_v, &feat_struct_len);
+               if (ret)
+                       goto fail;
 
-       ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-       ceph_decode_copy(&p, &fsid, sizeof(fsid));
-       epoch = ceph_decode_32(&p);
+               *p += feat_struct_len;  /* skip persistent_features */
 
-       num_mon = ceph_decode_32(&p);
+               ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+                                         &feat_struct_v, &feat_struct_len);
+               if (ret)
+                       goto fail;
 
+               *p += feat_struct_len;  /* skip optional_features */
+       }
+       ceph_decode_32_safe(p, end, num_mon, e_inval);
+
+       dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+            num_mon);
        if (num_mon > CEPH_MAX_MON)
-               goto bad;
-       m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
-       if (m == NULL)
-               return ERR_PTR(-ENOMEM);
-       m->fsid = fsid;
-       m->epoch = epoch;
-       m->num_mon = num_mon;
-       for (i = 0; i < num_mon; ++i) {
-               struct ceph_entity_inst *inst = &m->mon_inst[i];
-
-               /* copy name portion */
-               ceph_decode_copy_safe(&p, end, &inst->name,
-                                       sizeof(inst->name), bad);
-               err = ceph_decode_entity_addr(&p, end, &inst->addr);
-               if (err)
-                       goto bad;
+               goto e_inval;
+
+       monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+       if (!monmap) {
+               ret = -ENOMEM;
+               goto fail;
        }
-       dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-            m->num_mon);
-       for (i = 0; i < m->num_mon; i++)
-               dout("monmap_decode  mon%d is %s\n", i,
-                    ceph_pr_addr(&m->mon_inst[i].addr));
-       return m;
-bad:
-       dout("monmap_decode failed with %d\n", err);
-       kfree(m);
-       return ERR_PTR(err);
+       monmap->fsid = fsid;
+       monmap->epoch = epoch;
+       monmap->num_mon = num_mon;
+
+       /* legacy_mon_addr map or mon_info map */
+       for (i = 0; i < num_mon; i++) {
+               struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+               ceph_decode_skip_string(p, end, e_inval);  /* skip mon name */
+               inst->name.type = CEPH_ENTITY_TYPE_MON;
+               inst->name.num = cpu_to_le64(i);
+
+               if (struct_v >= 6)
+                       ret = decode_mon_info(p, end, msgr2, &inst->addr);
+               else
+                       ret = ceph_decode_entity_addr(p, end, &inst->addr);
+               if (ret)
+                       goto fail;
+
+               dout("%s mon%d addr %s\n", __func__, i,
+                    ceph_pr_addr(&inst->addr));
+       }
+
+       return monmap;
+
+e_inval:
+       ret = -EINVAL;
+fail:
+       kfree(monmap);
+       return ERR_PTR(ret);
 }
 
 /*
@@ -476,7 +541,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
        p = msg->front.iov_base;
        end = p + msg->front.iov_len;
 
-       monmap = ceph_monmap_decode(p, end);
+       monmap = ceph_monmap_decode(&p, end, false);
        if (IS_ERR(monmap)) {
                pr_err("problem decoding monmap, %d\n",
                       (int)PTR_ERR(monmap));
index 8966eae..51be5a7 100644 (file)
@@ -3918,9 +3918,9 @@ static int handle_one_map(struct ceph_osd_client *osdc,
        set_pool_was_full(osdc);
 
        if (incremental)
-               newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+               newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap);
        else
-               newmap = ceph_osdmap_decode(&p, end);
+               newmap = ceph_osdmap_decode(&p, end, false);
        if (IS_ERR(newmap))
                return PTR_ERR(newmap);
 
index fa08c15..2b1dd25 100644 (file)
@@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
 /*
  * decode a full map.
  */
-static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+static int osdmap_decode(void **p, void *end, bool msgr2,
+                        struct ceph_osdmap *map)
 {
        u8 struct_v;
        u32 epoch = 0;
@@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
                goto e_inval;
 
        for (i = 0; i < map->max_osd; i++) {
-               err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+               struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+               if (struct_v >= 8)
+                       err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+               else
+                       err = ceph_decode_entity_addr(p, end, addr);
                if (err)
                        goto bad;
+
+               dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
        }
 
        /* pg_temp */
@@ -1790,7 +1798,7 @@ bad:
 /*
  * Allocate and decode a full map.
  */
-struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
 {
        struct ceph_osdmap *map;
        int ret;
@@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
        if (!map)
                return ERR_PTR(-ENOMEM);
 
-       ret = osdmap_decode(p, end, map);
+       ret = osdmap_decode(p, end, msgr2, map);
        if (ret) {
                ceph_osdmap_destroy(map);
                return ERR_PTR(ret);
@@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
  */
 static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
-                                     struct ceph_osdmap *map)
+                                     bool msgr2, struct ceph_osdmap *map)
 {
        void *new_up_client;
        void *new_state;
        void *new_weight_end;
        u32 len;
+       int ret;
        int i;
 
        new_up_client = *p;
@@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
                struct ceph_entity_addr addr;
 
                ceph_decode_skip_32(p, end, e_inval);
-               if (ceph_decode_entity_addr(p, end, &addr))
-                       goto e_inval;
+               if (struct_v >= 7)
+                       ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+               else
+                       ret = ceph_decode_entity_addr(p, end, &addr);
+               if (ret)
+                       return ret;
        }
 
        new_state = *p;
@@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
        while (len--) {
                s32 osd;
                u32 xorstate;
-               int ret;
 
                osd = ceph_decode_32(p);
                if (struct_v >= 5)
@@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 
                osd = ceph_decode_32(p);
                BUG_ON(osd >= map->max_osd);
-               if (ceph_decode_entity_addr(p, end, &addr))
-                       goto e_inval;
+               if (struct_v >= 7)
+                       ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+               else
+                       ret = ceph_decode_entity_addr(p, end, &addr);
+               if (ret)
+                       return ret;
+
+               dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
                pr_info("osd%d up\n", osd);
                map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
                map->osd_addr[osd] = addr;
@@ -1927,7 +1946,7 @@ e_inval:
 /*
  * decode and apply an incremental map update.
  */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
                                             struct ceph_osdmap *map)
 {
        struct ceph_fsid fsid;
@@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        if (len > 0) {
                dout("apply_incremental full map len %d, %p to %p\n",
                     len, *p, end);
-               return ceph_osdmap_decode(p, min(*p+len, end));
+               return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
        }
 
        /* new crush? */
@@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
        }
 
        /* new_up_client, new_state, new_weight */
-       err = decode_new_up_state_weight(p, end, struct_v, map);
+       err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
        if (err)
                goto bad;