rds: Changing IP address internal representation to struct in6_addr
author Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Tue, 24 Jul 2018 03:51:21 +0000 (20:51 -0700)
committer David S. Miller <davem@davemloft.net>
Tue, 24 Jul 2018 04:17:44 +0000 (21:17 -0700)
This patch changes the internal representation of an IP address to use
struct in6_addr.  An IPv4 address is stored as an IPv4-mapped address.
All the functions which take an IP address as an argument are also
changed to use struct in6_addr.  However, the RDS socket layer is not
modified, so it still does not accept an IPv6 address from an
application, and the RDS layer neither accepts nor initiates IPv6
connections.

v2: Fixed sparse warnings.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
23 files changed:
net/rds/af_rds.c
net/rds/bind.c
net/rds/cong.c
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_recv.c
net/rds/ib_send.c
net/rds/loop.c
net/rds/rdma.c
net/rds/rdma_transport.c
net/rds/rds.h
net/rds/recv.c
net/rds/send.c
net/rds/tcp.c
net/rds/tcp_connect.c
net/rds/tcp_listen.c
net/rds/tcp_recv.c
net/rds/tcp_send.c
net/rds/threads.c
net/rds/transport.c

index ab751a1..fc1a5c6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
 #include <linux/kernel.h>
 #include <linux/gfp.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 #include <linux/poll.h>
 #include <net/sock.h>
 
@@ -113,26 +114,63 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
 static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
                       int peer)
 {
-       struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
-
-       memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+       struct sockaddr_in6 *sin6;
+       struct sockaddr_in *sin;
+       int uaddr_len;
 
        /* racey, don't care */
        if (peer) {
-               if (!rs->rs_conn_addr)
+               if (ipv6_addr_any(&rs->rs_conn_addr))
                        return -ENOTCONN;
 
-               sin->sin_port = rs->rs_conn_port;
-               sin->sin_addr.s_addr = rs->rs_conn_addr;
+               if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+                       sin = (struct sockaddr_in *)uaddr;
+                       memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+                       sin->sin_family = AF_INET;
+                       sin->sin_port = rs->rs_conn_port;
+                       sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+                       uaddr_len = sizeof(*sin);
+               } else {
+                       sin6 = (struct sockaddr_in6 *)uaddr;
+                       sin6->sin6_family = AF_INET6;
+                       sin6->sin6_port = rs->rs_conn_port;
+                       sin6->sin6_addr = rs->rs_conn_addr;
+                       sin6->sin6_flowinfo = 0;
+                       /* scope_id is the same as in the bound address. */
+                       sin6->sin6_scope_id = rs->rs_bound_scope_id;
+                       uaddr_len = sizeof(*sin6);
+               }
        } else {
-               sin->sin_port = rs->rs_bound_port;
-               sin->sin_addr.s_addr = rs->rs_bound_addr;
+               /* If socket is not yet bound, set the return address family
+                * to be AF_UNSPEC (value 0) and the address size to be that
+                * of an IPv4 address.
+                */
+               if (ipv6_addr_any(&rs->rs_bound_addr)) {
+                       sin = (struct sockaddr_in *)uaddr;
+                       memset(sin, 0, sizeof(*sin));
+                       sin->sin_family = AF_UNSPEC;
+                       return sizeof(*sin);
+               }
+               if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+                       sin = (struct sockaddr_in *)uaddr;
+                       memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+                       sin->sin_family = AF_INET;
+                       sin->sin_port = rs->rs_bound_port;
+                       sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+                       uaddr_len = sizeof(*sin);
+               } else {
+                       sin6 = (struct sockaddr_in6 *)uaddr;
+                       sin6->sin6_family = AF_INET6;
+                       sin6->sin6_port = rs->rs_bound_port;
+                       sin6->sin6_addr = rs->rs_bound_addr;
+                       sin6->sin6_flowinfo = 0;
+                       sin6->sin6_scope_id = rs->rs_bound_scope_id;
+                       uaddr_len = sizeof(*sin6);
+               }
        }
 
-       sin->sin_family = AF_INET;
-
-       return sizeof(*sin);
+       return uaddr_len;
 }
 
 /*
@@ -203,11 +241,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
                              int len)
 {
+       struct sockaddr_in6 sin6;
        struct sockaddr_in sin;
        int ret = 0;
 
        /* racing with another thread binding seems ok here */
-       if (rs->rs_bound_addr == 0) {
+       if (ipv6_addr_any(&rs->rs_bound_addr)) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out;
        }
@@ -215,14 +254,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
        if (len < sizeof(struct sockaddr_in)) {
                ret = -EINVAL;
                goto out;
+       } else if (len < sizeof(struct sockaddr_in6)) {
+               /* Assume IPv4 */
+               if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
+               sin6.sin6_port = sin.sin_port;
+       } else {
+               if (copy_from_user(&sin6, optval,
+                                  sizeof(struct sockaddr_in6))) {
+                       ret = -EFAULT;
+                       goto out;
+               }
        }
 
-       if (copy_from_user(&sin, optval, sizeof(sin))) {
-               ret = -EFAULT;
-               goto out;
-       }
-
-       rds_send_drop_to(rs, &sin);
+       rds_send_drop_to(rs, &sin6);
 out:
        return ret;
 }
@@ -435,31 +483,41 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags)
 {
        struct sock *sk = sock->sk;
-       struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+       struct sockaddr_in *sin;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        int ret = 0;
 
        lock_sock(sk);
 
-       if (addr_len != sizeof(struct sockaddr_in)) {
-               ret = -EINVAL;
-               goto out;
-       }
+       switch (addr_len) {
+       case sizeof(struct sockaddr_in):
+               sin = (struct sockaddr_in *)uaddr;
+               if (sin->sin_family != AF_INET) {
+                       ret = -EAFNOSUPPORT;
+                       break;
+               }
+               if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+                       ret = -EDESTADDRREQ;
+                       break;
+               }
+               if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+                   sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
+                       ret = -EINVAL;
+                       break;
+               }
+               ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+               rs->rs_conn_port = sin->sin_port;
+               break;
 
-       if (sin->sin_family != AF_INET) {
-               ret = -EAFNOSUPPORT;
-               goto out;
-       }
+       case sizeof(struct sockaddr_in6):
+               ret = -EPROTONOSUPPORT;
+               break;
 
-       if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
-               ret = -EDESTADDRREQ;
-               goto out;
+       default:
+               ret = -EINVAL;
+               break;
        }
 
-       rs->rs_conn_addr = sin->sin_addr.s_addr;
-       rs->rs_conn_port = sin->sin_port;
-
-out:
        release_sock(sk);
        return ret;
 }
@@ -578,8 +636,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
                        total++;
                        if (total <= len)
-                               rds_inc_info_copy(inc, iter, inc->i_saddr,
-                                                 rs->rs_bound_addr, 1);
+                               rds_inc_info_copy(inc, iter,
+                                                 inc->i_saddr.s6_addr32[3],
+                                                 rs->rs_bound_addr_v4,
+                                                 1);
                }
 
                read_unlock(&rs->rs_recv_lock);
@@ -608,8 +668,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                sinfo.sndbuf = rds_sk_sndbuf(rs);
                sinfo.rcvbuf = rds_sk_rcvbuf(rs);
-               sinfo.bound_addr = rs->rs_bound_addr;
-               sinfo.connected_addr = rs->rs_conn_addr;
+               sinfo.bound_addr = rs->rs_bound_addr_v4;
+               sinfo.connected_addr = rs->rs_conn_addr_v4;
                sinfo.bound_port = rs->rs_bound_port;
                sinfo.connected_port = rs->rs_conn_port;
                sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
index 5aa3a64..c401776 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <net/sock.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 #include <linux/if_arp.h>
 #include <linux/jhash.h>
 #include <linux/ratelimit.h>
@@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table;
 
 static const struct rhashtable_params ht_parms = {
        .nelem_hint = 768,
-       .key_len = sizeof(u64),
+       .key_len = RDS_BOUND_KEY_LEN,
        .key_offset = offsetof(struct rds_sock, rs_bound_key),
        .head_offset = offsetof(struct rds_sock, rs_bound_node),
        .max_size = 16384,
        .min_size = 1024,
 };
 
+/* Create a key for the bind hash table manipulation.  Port is in network byte
+ * order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+                                        __be16 port, __u32 scope_id)
+{
+       memcpy(key, addr, sizeof(*addr));
+       key += sizeof(*addr);
+       memcpy(key, &port, sizeof(port));
+       key += sizeof(port);
+       memcpy(key, &scope_id, sizeof(scope_id));
+}
+
 /*
  * Return the rds_sock bound at the given local address.
  *
  * The rx path can race with rds_release.  We notice if rds_release() has
  * marked this socket and don't return a rs ref to the rx path.
  */
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+                               __u32 scope_id)
 {
-       u64 key = ((u64)addr << 32) | port;
+       u8 key[RDS_BOUND_KEY_LEN];
        struct rds_sock *rs;
 
-       rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
+       __rds_create_bind_key(key, addr, port, scope_id);
+       rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
        if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
                rds_sock_addref(rs);
        else
                rs = NULL;
 
-       rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
-               ntohs(port));
+       rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+                ntohs(port));
 
        return rs;
 }
 
 /* returns -ve errno or +ve port */
-static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+                        __be16 *port, __u32 scope_id)
 {
        int ret = -EADDRINUSE;
        u16 rover, last;
-       u64 key;
+       u8 key[RDS_BOUND_KEY_LEN];
 
        if (*port != 0) {
                rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 
                if (rover == RDS_FLAG_PROBE_PORT)
                        continue;
-               key = ((u64)addr << 32) | cpu_to_be16(rover);
-               if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+               __rds_create_bind_key(key, addr, cpu_to_be16(rover),
+                                     scope_id);
+               if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
                        continue;
 
-               rs->rs_bound_key = key;
-               rs->rs_bound_addr = addr;
+               memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+               rs->rs_bound_addr = *addr;
                net_get_random_once(&rs->rs_hash_initval,
                                    sizeof(rs->rs_hash_initval));
                rs->rs_bound_port = cpu_to_be16(rover);
@@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
                          rs, &addr, (int)ntohs(*port));
                        break;
                } else {
-                       rs->rs_bound_addr = 0;
+                       rs->rs_bound_addr = in6addr_any;
                        rds_sock_put(rs);
                        ret = -ENOMEM;
                        break;
@@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 void rds_remove_bound(struct rds_sock *rs)
 {
 
-       if (!rs->rs_bound_addr)
+       if (ipv6_addr_any(&rs->rs_bound_addr))
                return;
 
-       rdsdebug("rs %p unbinding from %pI4:%d\n",
+       rdsdebug("rs %p unbinding from %pI6c:%d\n",
                 rs, &rs->rs_bound_addr,
                 ntohs(rs->rs_bound_port));
 
        rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
        rds_sock_put(rs);
-       rs->rs_bound_addr = 0;
+       rs->rs_bound_addr = in6addr_any;
 }
 
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
        struct sock *sk = sock->sk;
-       struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
        struct rds_sock *rs = rds_sk_to_rs(sk);
+       struct in6_addr v6addr, *binding_addr;
        struct rds_transport *trans;
+       __u32 scope_id = 0;
        int ret = 0;
+       __be16 port;
 
+       /* We only allow an RDS socket to be bound to an IPv4 address. IPv6
+        * address support will be added later.
+        */
+       if (addr_len == sizeof(struct sockaddr_in)) {
+               struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+               if (sin->sin_family != AF_INET ||
+                   sin->sin_addr.s_addr == htonl(INADDR_ANY))
+                       return -EINVAL;
+               ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+               binding_addr = &v6addr;
+               port = sin->sin_port;
+       } else if (addr_len == sizeof(struct sockaddr_in6)) {
+               return -EPROTONOSUPPORT;
+       } else {
+               return -EINVAL;
+       }
        lock_sock(sk);
 
-       if (addr_len != sizeof(struct sockaddr_in) ||
-           sin->sin_family != AF_INET ||
-           rs->rs_bound_addr ||
-           sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+       /* RDS socket does not allow re-binding. */
+       if (!ipv6_addr_any(&rs->rs_bound_addr)) {
                ret = -EINVAL;
                goto out;
        }
 
-       ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+       ret = rds_add_bound(rs, binding_addr, &port, scope_id);
        if (ret)
                goto out;
 
        if (rs->rs_transport) { /* previously bound */
                trans = rs->rs_transport;
                if (trans->laddr_check(sock_net(sock->sk),
-                                      sin->sin_addr.s_addr) != 0) {
+                                      binding_addr, scope_id) != 0) {
                        ret = -ENOPROTOOPT;
                        rds_remove_bound(rs);
                } else {
@@ -172,13 +207,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                }
                goto out;
        }
-       trans = rds_trans_get_preferred(sock_net(sock->sk),
-                                       sin->sin_addr.s_addr);
+       trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
+                                       scope_id);
        if (!trans) {
                ret = -EADDRNOTAVAIL;
                rds_remove_bound(rs);
-               pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
-                                   __func__, &sin->sin_addr.s_addr);
+               pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+                                   __func__, binding_addr);
                goto out;
        }
 
index 63da9d2..ccdff09 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Oracle.  All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
 static DEFINE_SPINLOCK(rds_cong_lock);
 static struct rb_root rds_cong_tree = RB_ROOT;
 
-static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
                                               struct rds_cong_map *insert)
 {
        struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
        struct rds_cong_map *map;
 
        while (*p) {
+               int diff;
+
                parent = *p;
                map = rb_entry(parent, struct rds_cong_map, m_rb_node);
 
-               if (addr < map->m_addr)
+               diff = rds_addr_cmp(addr, &map->m_addr);
+               if (diff < 0)
                        p = &(*p)->rb_left;
-               else if (addr > map->m_addr)
+               else if (diff > 0)
                        p = &(*p)->rb_right;
                else
                        return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
  * these bitmaps in the process getting pointers to them.  The bitmaps are only
  * ever freed as the module is removed after all connections have been freed.
  */
-static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
 {
        struct rds_cong_map *map;
        struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
        if (!map)
                return NULL;
 
-       map->m_addr = addr;
+       map->m_addr = *addr;
        init_waitqueue_head(&map->m_waitq);
        INIT_LIST_HEAD(&map->m_conn_list);
 
@@ -171,7 +174,7 @@ out:
                kfree(map);
        }
 
-       rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+       rdsdebug("map %p for addr %pI6c\n", ret, addr);
 
        return ret;
 }
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
 
 int rds_cong_get_maps(struct rds_connection *conn)
 {
-       conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
-       conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+       conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
+       conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
 
        if (!(conn->c_lcong && conn->c_fcong))
                return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
 
        /* update congestion map for now-closed port */
        spin_lock_irqsave(&rds_cong_lock, flags);
-       map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+       map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
        spin_unlock_irqrestore(&rds_cong_lock, flags);
 
        if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
index cfb0595..3176ead 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,8 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <net/inet_hashtables.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
 
 #include "rds.h"
 #include "loop.h"
@@ -49,18 +50,21 @@ static unsigned long rds_conn_count;
 static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
 static struct kmem_cache *rds_conn_slab;
 
-static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+                                         const struct in6_addr *faddr)
 {
+       static u32 rds6_hash_secret __read_mostly;
        static u32 rds_hash_secret __read_mostly;
 
-       unsigned long hash;
+       u32 lhash, fhash, hash;
 
        net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+       net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+       lhash = (__force u32)laddr->s6_addr32[3];
+       fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+       hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
 
-       /* Pass NULL, don't need struct net for hash */
-       hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
-                             be32_to_cpu(faddr), 0,
-                             rds_hash_secret);
        return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
 }
 
@@ -72,20 +76,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 /* rcu read lock must be held or the connection spinlock */
 static struct rds_connection *rds_conn_lookup(struct net *net,
                                              struct hlist_head *head,
-                                             __be32 laddr, __be32 faddr,
-                                             struct rds_transport *trans)
+                                             const struct in6_addr *laddr,
+                                             const struct in6_addr *faddr,
+                                             struct rds_transport *trans,
+                                             int dev_if)
 {
        struct rds_connection *conn, *ret = NULL;
 
        hlist_for_each_entry_rcu(conn, head, c_hash_node) {
-               if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
-                   conn->c_trans == trans && net == rds_conn_net(conn)) {
+               if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
+                   ipv6_addr_equal(&conn->c_laddr, laddr) &&
+                   conn->c_trans == trans &&
+                   net == rds_conn_net(conn) &&
+                   conn->c_dev_if == dev_if) {
                        ret = conn;
                        break;
                }
        }
-       rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
-                &laddr, &faddr);
+       rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
+                laddr, faddr);
        return ret;
 }
 
@@ -99,8 +108,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
 {
        struct rds_connection *conn = cp->cp_conn;
 
-       rdsdebug("connection %pI4 to %pI4 reset\n",
-         &conn->c_laddr, &conn->c_faddr);
+       rdsdebug("connection %pI6c to %pI6c reset\n",
+                &conn->c_laddr, &conn->c_faddr);
 
        rds_stats_inc(s_conn_reset);
        rds_send_path_reset(cp);
@@ -142,9 +151,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
  * are torn down as the module is removed, if ever.
  */
 static struct rds_connection *__rds_conn_create(struct net *net,
-                                               __be32 laddr, __be32 faddr,
-                                      struct rds_transport *trans, gfp_t gfp,
-                                      int is_outgoing)
+                                               const struct in6_addr *laddr,
+                                               const struct in6_addr *faddr,
+                                               struct rds_transport *trans,
+                                               gfp_t gfp,
+                                               int is_outgoing,
+                                               int dev_if)
 {
        struct rds_connection *conn, *parent = NULL;
        struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +166,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
 
        rcu_read_lock();
-       conn = rds_conn_lookup(net, head, laddr, faddr, trans);
-       if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
-           laddr == faddr && !is_outgoing) {
+       conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+       if (conn &&
+           conn->c_loopback &&
+           conn->c_trans != &rds_loop_transport &&
+           ipv6_addr_equal(laddr, faddr) &&
+           !is_outgoing) {
                /* This is a looped back IB connection, and we're
                 * called by the code handling the incoming connect.
                 * We need a second connection object into which we
@@ -181,8 +196,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        }
 
        INIT_HLIST_NODE(&conn->c_hash_node);
-       conn->c_laddr = laddr;
-       conn->c_faddr = faddr;
+       conn->c_laddr = *laddr;
+       conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+       conn->c_faddr = *faddr;
+       conn->c_dev_if = dev_if;
 
        rds_conn_net_set(conn, net);
 
@@ -199,7 +216,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
         * can bind to the destination address then we'd rather the messages
         * flow through loopback rather than either transport.
         */
-       loop_trans = rds_trans_get_preferred(net, faddr);
+       loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
        if (loop_trans) {
                rds_trans_put(loop_trans);
                conn->c_loopback = 1;
@@ -233,10 +250,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
                goto out;
        }
 
-       rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
-         conn, &laddr, &faddr,
-         strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
-         "[unknown]", is_outgoing ? "(outgoing)" : "");
+       rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+                conn, laddr, faddr,
+                strnlen(trans->t_name, sizeof(trans->t_name)) ?
+                trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
 
        /*
         * Since we ran without holding the conn lock, someone could
@@ -262,7 +279,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
                /* Creating normal conn */
                struct rds_connection *found;
 
-               found = rds_conn_lookup(net, head, laddr, faddr, trans);
+               found = rds_conn_lookup(net, head, laddr, faddr, trans,
+                                       dev_if);
                if (found) {
                        struct rds_conn_path *cp;
                        int i;
@@ -295,18 +313,22 @@ out:
 }
 
 struct rds_connection *rds_conn_create(struct net *net,
-                                      __be32 laddr, __be32 faddr,
-                                      struct rds_transport *trans, gfp_t gfp)
+                                      const struct in6_addr *laddr,
+                                      const struct in6_addr *faddr,
+                                      struct rds_transport *trans, gfp_t gfp,
+                                      int dev_if)
 {
-       return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
+       return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
-                                               __be32 laddr, __be32 faddr,
-                                      struct rds_transport *trans, gfp_t gfp)
+                                               const struct in6_addr *laddr,
+                                               const struct in6_addr *faddr,
+                                               struct rds_transport *trans,
+                                               gfp_t gfp, int dev_if)
 {
-       return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
+       return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
@@ -502,12 +524,17 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 
                                /* XXX too lazy to maintain counts.. */
                                list_for_each_entry(rm, list, m_conn_item) {
+                                       __be32 laddr;
+                                       __be32 faddr;
+
                                        total++;
+                                       laddr = conn->c_laddr.s6_addr32[3];
+                                       faddr = conn->c_faddr.s6_addr32[3];
                                        if (total <= len)
                                                rds_inc_info_copy(&rm->m_inc,
                                                                  iter,
-                                                                 conn->c_laddr,
-                                                                 conn->c_faddr,
+                                                                 laddr,
+                                                                 faddr,
                                                                  0);
                                }
 
@@ -584,7 +611,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
        struct hlist_head *head;
        struct rds_connection *conn;
        size_t i;
-       int j;
 
        rcu_read_lock();
 
@@ -595,17 +621,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
             i++, head++) {
                hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                        struct rds_conn_path *cp;
-                       int npaths;
 
-                       npaths = (conn->c_trans->t_mp_capable ?
-                                RDS_MPATH_WORKERS : 1);
-                       for (j = 0; j < npaths; j++) {
-                               cp = &conn->c_path[j];
+                       /* XXX We only copy the information from the first
+                        * path for now.  The problem is that if there are
+                        * more than one underlying paths, we cannot report
+                        * information of all of them using the existing
+                        * API.  For example, there is only one next_tx_seq,
+                        * which path's next_tx_seq should we report?  It is
+                        * a bug in the design of MPRDS.
+                        */
+                       cp = conn->c_path;
 
-                               /* XXX no cp_lock usage.. */
-                               if (!visitor(cp, buffer))
-                                       continue;
-                       }
+                       /* XXX no cp_lock usage.. */
+                       if (!visitor(cp, buffer))
+                               continue;
 
                        /* We copy as much as we can fit in the buffer,
                         * but we count all items so that the caller
@@ -624,12 +653,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 {
        struct rds_info_connection *cinfo = buffer;
+       struct rds_connection *conn = cp->cp_conn;
 
        cinfo->next_tx_seq = cp->cp_next_tx_seq;
        cinfo->next_rx_seq = cp->cp_next_rx_seq;
-       cinfo->laddr = cp->cp_conn->c_laddr;
-       cinfo->faddr = cp->cp_conn->c_faddr;
-       strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
+       cinfo->laddr = conn->c_laddr.s6_addr32[3];
+       cinfo->faddr = conn->c_faddr.s6_addr32[3];
+       strncpy(cinfo->transport, conn->c_trans->t_name,
                sizeof(cinfo->transport));
        cinfo->flags = 0;
 
index b6ad38e..c712a84 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -296,8 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
        if (conn->c_trans != &rds_ib_transport)
                return 0;
 
-       iinfo->src_addr = conn->c_laddr;
-       iinfo->dst_addr = conn->c_faddr;
+       iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+       iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
 
        memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
        memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -341,7 +341,8 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy "routing".
  */
-static int rds_ib_laddr_check(struct net *net, __be32 addr)
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+                             __u32 scope_id)
 {
        int ret;
        struct rdma_cm_id *cm_id;
@@ -357,7 +358,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
 
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = addr;
+       sin.sin_addr.s_addr = addr->s6_addr32[3];
 
        /* rdma_bind_addr will only succeed for IB & iWARP devices */
        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
@@ -367,9 +368,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
            cm_id->device->node_type != RDMA_NODE_IB_CA)
                ret = -EADDRNOTAVAIL;
 
-       rdsdebug("addr %pI4 ret %d node type %d\n",
-               &addr, ret,
-               cm_id->device ? cm_id->device->node_type : -1);
+       rdsdebug("addr %pI6c ret %d node type %d\n",
+                addr, ret,
+                cm_id->device ? cm_id->device->node_type : -1);
 
        rdma_destroy_id(cm_id);
 
index a6f4d7d..beb95b8 100644 (file)
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
        struct list_head         *ready;
 };
 
+/* This is the part of the IB private data exchange that is common to IPv4 and
+ * IPv6 when setting up an RDS connection.  The full exchange differs between
+ * the two because the addresses sit at the beginning of the private data and
+ * their size is different.  Hence a single structure cannot be used for both
+ * without breaking interoperability.
+ */
+struct rds_ib_conn_priv_cmn {
+       u8                      ricpc_protocol_major;
+       u8                      ricpc_protocol_minor;
+       __be16                  ricpc_protocol_minor_mask;      /* bitmask */
+       __be32                  ricpc_reserved1;
+       __be64                  ricpc_ack_seq;
+       __be32                  ricpc_credit;   /* non-zero enables flow ctl */
+};
+
 struct rds_ib_connect_private {
        /* Add new fields at the end, and don't permute existing fields. */
-       __be32                  dp_saddr;
-       __be32                  dp_daddr;
-       u8                      dp_protocol_major;
-       u8                      dp_protocol_minor;
-       __be16                  dp_protocol_minor_mask; /* bitmask */
-       __be32                  dp_reserved1;
-       __be64                  dp_ack_seq;
-       __be32                  dp_credit;              /* non-zero enables flow ctl */
+       __be32                          dp_saddr;
+       __be32                          dp_daddr;
+       struct rds_ib_conn_priv_cmn     dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+       /* Add new fields at the end, and don't permute existing fields. */
+       struct in6_addr                 dp_saddr;
+       struct in6_addr                 dp_daddr;
+       struct rds_ib_conn_priv_cmn     dp_cmn;
+};
+
+#define dp_protocol_major      dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor      dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq             dp_cmn.ricpc_ack_seq
+#define dp_credit              dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+       struct rds_ib_connect_private   ricp_v4;
+       struct rds6_ib_connect_private  ricp_v6;
 };
 
 struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
 __printf(2, 3)
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
-                            struct rdma_cm_event *event);
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+                            struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
 void rds_ib_cm_connect_complete(struct rds_connection *conn,
                                struct rdma_cm_event *event);
 
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
        __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
 
 /* ib_rdma.c */
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+                        struct in6_addr *ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_destroy_nodev_conns(void);
index f1684ae..dd8a867 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/ratelimit.h>
+#include <net/addrconf.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
@@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
  */
 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 {
-       const struct rds_ib_connect_private *dp = NULL;
        struct rds_ib_connection *ic = conn->c_transport_data;
+       const union rds_ib_conn_priv *dp = NULL;
        struct ib_qp_attr qp_attr;
+       __be64 ack_seq = 0;
+       __be32 credit = 0;
+       u8 major = 0;
+       u8 minor = 0;
        int err;
 
-       if (event->param.conn.private_data_len >= sizeof(*dp)) {
-               dp = event->param.conn.private_data;
-
-               /* make sure it isn't empty data */
-               if (dp->dp_protocol_major) {
-                       rds_ib_set_protocol(conn,
-                               RDS_PROTOCOL(dp->dp_protocol_major,
-                               dp->dp_protocol_minor));
-                       rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+       dp = event->param.conn.private_data;
+       if (conn->c_isv6) {
+               if (event->param.conn.private_data_len >=
+                   sizeof(struct rds6_ib_connect_private)) {
+                       major = dp->ricp_v6.dp_protocol_major;
+                       minor = dp->ricp_v6.dp_protocol_minor;
+                       credit = dp->ricp_v6.dp_credit;
+                       /* dp structure start is not guaranteed to be 8 bytes
+                        * aligned.  Since dp_ack_seq is 64-bit extended load
+                        * operations can be used so go through get_unaligned
+                        * to avoid unaligned errors.
+                        */
+                       ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
                }
+       } else if (event->param.conn.private_data_len >=
+                  sizeof(struct rds_ib_connect_private)) {
+               major = dp->ricp_v4.dp_protocol_major;
+               minor = dp->ricp_v4.dp_protocol_minor;
+               credit = dp->ricp_v4.dp_credit;
+               ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
+       }
+
+       /* make sure it isn't empty data */
+       if (major) {
+               rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
+               rds_ib_set_flow_control(conn, be32_to_cpu(credit));
        }
 
        if (conn->c_version < RDS_PROTOCOL(3, 1)) {
-               pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+               pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
                          &conn->c_laddr, &conn->c_faddr,
                          RDS_PROTOCOL_MAJOR(conn->c_version),
                          RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                rds_conn_destroy(conn);
                return;
        } else {
-               pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+               pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
                          ic->i_active_side ? "Active" : "Passive",
                          &conn->c_laddr, &conn->c_faddr,
                          RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
        /* update ib_device with this local ipaddr */
-       err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+       err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
        if (err)
                printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
                        err);
@@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK. */
        if (dp) {
-               /* dp structure start is not guaranteed to be 8 bytes aligned.
-                * Since dp_ack_seq is 64-bit extended load operations can be
-                * used so go through get_unaligned to avoid unaligned errors.
-                */
-               __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
-               if (dp_ack_seq)
-                       rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+               if (ack_seq)
+                       rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
                                            NULL);
        }
 
@@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 }
 
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
-                       struct rdma_conn_param *conn_param,
-                       struct rds_ib_connect_private *dp,
-                       u32 protocol_version,
-                       u32 max_responder_resources,
-                       u32 max_initiator_depth)
+                                     struct rdma_conn_param *conn_param,
+                                     union rds_ib_conn_priv *dp,
+                                     u32 protocol_version,
+                                     u32 max_responder_resources,
+                                     u32 max_initiator_depth,
+                                     bool isv6)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
        if (dp) {
                memset(dp, 0, sizeof(*dp));
-               dp->dp_saddr = conn->c_laddr;
-               dp->dp_daddr = conn->c_faddr;
-               dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
-               dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
-               dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
-               dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
+               if (isv6) {
+                       dp->ricp_v6.dp_saddr = conn->c_laddr;
+                       dp->ricp_v6.dp_daddr = conn->c_faddr;
+                       dp->ricp_v6.dp_protocol_major =
+                           RDS_PROTOCOL_MAJOR(protocol_version);
+                       dp->ricp_v6.dp_protocol_minor =
+                           RDS_PROTOCOL_MINOR(protocol_version);
+                       dp->ricp_v6.dp_protocol_minor_mask =
+                           cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+                       dp->ricp_v6.dp_ack_seq =
+                           cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+                       conn_param->private_data = &dp->ricp_v6;
+                       conn_param->private_data_len = sizeof(dp->ricp_v6);
+               } else {
+                       dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
+                       dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
+                       dp->ricp_v4.dp_protocol_major =
+                           RDS_PROTOCOL_MAJOR(protocol_version);
+                       dp->ricp_v4.dp_protocol_minor =
+                           RDS_PROTOCOL_MINOR(protocol_version);
+                       dp->ricp_v4.dp_protocol_minor_mask =
+                           cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+                       dp->ricp_v4.dp_ack_seq =
+                           cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+                       conn_param->private_data = &dp->ricp_v4;
+                       conn_param->private_data_len = sizeof(dp->ricp_v4);
+               }
 
                /* Advertise flow control */
                if (ic->i_flowctl) {
                        unsigned int credits;
 
-                       credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
-                       dp->dp_credit = cpu_to_be32(credits);
-                       atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+                       credits = IB_GET_POST_CREDITS
+                               (atomic_read(&ic->i_credits));
+                       if (isv6)
+                               dp->ricp_v6.dp_credit = cpu_to_be32(credits);
+                       else
+                               dp->ricp_v4.dp_credit = cpu_to_be32(credits);
+                       atomic_sub(IB_SET_POST_CREDITS(credits),
+                                  &ic->i_credits);
                }
-
-               conn_param->private_data = dp;
-               conn_param->private_data_len = sizeof(*dp);
        }
 }
 
@@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
                break;
        default:
                rdsdebug("Fatal QP Event %u (%s) "
-                       "- connection %pI4->%pI4, reconnecting\n",
+                       "- connection %pI6c->%pI6c, reconnecting\n",
                        event->event, ib_event_msg(event->event),
                        &conn->c_laddr, &conn->c_faddr);
                rds_conn_drop(conn);
@@ -580,11 +621,13 @@ out:
        return ret;
 }
 
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
 {
-       const struct rds_ib_connect_private *dp = event->param.conn.private_data;
-       u16 common;
+       const union rds_ib_conn_priv *dp = event->param.conn.private_data;
+       u8 data_len, major, minor;
        u32 version = 0;
+       __be16 mask;
+       u16 common;
 
        /*
         * rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +646,126 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
                return 0;
        }
 
+       if (isv6) {
+               data_len = sizeof(struct rds6_ib_connect_private);
+               major = dp->ricp_v6.dp_protocol_major;
+               minor = dp->ricp_v6.dp_protocol_minor;
+               mask = dp->ricp_v6.dp_protocol_minor_mask;
+       } else {
+               data_len = sizeof(struct rds_ib_connect_private);
+               major = dp->ricp_v4.dp_protocol_major;
+               minor = dp->ricp_v4.dp_protocol_minor;
+               mask = dp->ricp_v4.dp_protocol_minor_mask;
+       }
+
        /* Even if len is crap *now* I still want to check it. -ASG */
-       if (event->param.conn.private_data_len < sizeof (*dp) ||
-           dp->dp_protocol_major == 0)
+       if (event->param.conn.private_data_len < data_len || major == 0)
                return RDS_PROTOCOL_3_0;
 
-       common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
-       if (dp->dp_protocol_major == 3 && common) {
+       common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+       if (major == 3 && common) {
                version = RDS_PROTOCOL_3_0;
                while ((common >>= 1) != 0)
                        version++;
-       } else
-               printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
-                               &dp->dp_saddr,
-                               dp->dp_protocol_major,
-                               dp->dp_protocol_minor);
+       } else {
+               if (isv6)
+                       printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
+                                          &dp->ricp_v6.dp_saddr, major, minor);
+               else
+                       printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+                                          &dp->ricp_v4.dp_saddr, major, minor);
+       }
        return version;
 }
 
+/* Given an IPv6 address, find the IB net_device which hosts that address and
+ * return its interface index, or 0 if no such device is found.  This is used
+ * by rds_ib_cm_handle_connect() to find the interface an incoming request
+ * came in on when the request is using a link local address.
+ *
+ * Note one problem in this search.  It is possible that two interfaces have
+ * the same link local address, in which case the first match wins.  This
+ * cannot be solved unless the underlying layer gives us the interface which
+ * an incoming RDMA connect request comes from.
+ */
+static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
+{
+       struct net_device *dev;
+       int idx = 0;    /* 0 means "not found" to the caller */
+
+       rcu_read_lock();
+       for_each_netdev_rcu(net, dev) {
+               if (dev->type == ARPHRD_INFINIBAND &&
+                   ipv6_chk_addr(net, addr, dev, 0)) {
+                       idx = dev->ifindex;
+                       break;  /* first matching device wins */
+               }
+       }
+       rcu_read_unlock();
+
+       return idx;
+}
+
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
-                                   struct rdma_cm_event *event)
+                            struct rdma_cm_event *event, bool isv6)
 {
        __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
        __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
-       const struct rds_ib_connect_private *dp = event->param.conn.private_data;
-       struct rds_ib_connect_private dp_rep;
+       const struct rds_ib_conn_priv_cmn *dp_cmn;
        struct rds_connection *conn = NULL;
        struct rds_ib_connection *ic = NULL;
        struct rdma_conn_param conn_param;
+       const union rds_ib_conn_priv *dp;
+       union rds_ib_conn_priv dp_rep;
+       struct in6_addr s_mapped_addr;
+       struct in6_addr d_mapped_addr;
+       const struct in6_addr *saddr6;
+       const struct in6_addr *daddr6;
+       int destroy = 1;
+       u32 ifindex = 0;
        u32 version;
-       int err = 1, destroy = 1;
+       int err = 1;
 
        /* Check whether the remote protocol version matches ours. */
-       version = rds_ib_protocol_compatible(event);
+       version = rds_ib_protocol_compatible(event, isv6);
        if (!version)
                goto out;
 
-       rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
-                "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+       dp = event->param.conn.private_data;
+       if (isv6) {
+               dp_cmn = &dp->ricp_v6.dp_cmn;
+               saddr6 = &dp->ricp_v6.dp_saddr;
+               daddr6 = &dp->ricp_v6.dp_daddr;
+               /* If the local address is link local, need to find the
+                * interface index in order to create a proper RDS
+                * connection.
+                */
+               if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
+                       /* Using init_net for now ..  */
+                       ifindex = __rds_find_ifindex(&init_net, daddr6);
+                       /* No index found...  Need to bail out. */
+                       if (ifindex == 0) {
+                               err = -EOPNOTSUPP;
+                               goto out;
+                       }
+               }
+       } else {
+               dp_cmn = &dp->ricp_v4.dp_cmn;
+               ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
+               ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
+               saddr6 = &s_mapped_addr;
+               daddr6 = &d_mapped_addr;
+       }
+
+       rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
+                "0x%llx\n", saddr6, daddr6,
                 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
                 (unsigned long long)be64_to_cpu(lguid),
                 (unsigned long long)be64_to_cpu(fguid));
 
        /* RDS/IB is not currently netns aware, thus init_net */
-       conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
-                              &rds_ib_transport, GFP_KERNEL);
+       conn = rds_conn_create(&init_net, daddr6, saddr6,
+                              &rds_ib_transport, GFP_KERNEL, ifindex);
        if (IS_ERR(conn)) {
                rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
                conn = NULL;
@@ -678,12 +796,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        ic = conn->c_transport_data;
 
        rds_ib_set_protocol(conn, version);
-       rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+       rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
 
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK. */
-       if (dp->dp_ack_seq)
-               rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+       if (dp_cmn->ricpc_ack_seq)
+               rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
+                                   NULL);
 
        BUG_ON(cm_id->context);
        BUG_ON(ic->i_cm_id);
@@ -702,8 +821,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        }
 
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
-               event->param.conn.responder_resources,
-               event->param.conn.initiator_depth);
+                                 event->param.conn.responder_resources,
+                                 event->param.conn.initiator_depth, isv6);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +837,12 @@ out:
 }
 
 
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
 {
        struct rds_connection *conn = cm_id->context;
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rdma_conn_param conn_param;
-       struct rds_ib_connect_private dp;
+       union rds_ib_conn_priv dp;
        int ret;
 
        /* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +857,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
        }
 
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
-               UINT_MAX, UINT_MAX);
+                                 UINT_MAX, UINT_MAX, isv6);
        ret = rdma_connect(cm_id, &conn_param);
        if (ret)
                rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +877,17 @@ out:
 int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 {
        struct rds_connection *conn = cp->cp_conn;
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       struct sockaddr_in src, dest;
+       struct sockaddr_storage src, dest;
+       rdma_cm_event_handler handler;
+       struct rds_ib_connection *ic;
        int ret;
 
+       ic = conn->c_transport_data;
+
        /* XXX I wonder what affect the port space has */
        /* delegate cm event handler to rdma_transport */
-       ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
+       handler = rds_rdma_cm_event_handler;
+       ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
                                     RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(ic->i_cm_id)) {
                ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +898,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 
        rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
 
-       src.sin_family = AF_INET;
-       src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-       src.sin_port = (__force u16)htons(0);
+       if (ipv6_addr_v4mapped(&conn->c_faddr)) {
+               struct sockaddr_in *sin;
+
+               sin = (struct sockaddr_in *)&src;
+               sin->sin_family = AF_INET;
+               sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+               sin->sin_port = 0;
 
-       dest.sin_family = AF_INET;
-       dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-       dest.sin_port = (__force u16)htons(RDS_PORT);
+               sin = (struct sockaddr_in *)&dest;
+               sin->sin_family = AF_INET;
+               sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+               sin->sin_port = htons(RDS_PORT);
+       } else {
+               struct sockaddr_in6 *sin6;
+
+               sin6 = (struct sockaddr_in6 *)&src;
+               sin6->sin6_family = AF_INET6;
+               sin6->sin6_addr = conn->c_laddr;
+               sin6->sin6_port = 0;
+               sin6->sin6_scope_id = conn->c_dev_if;
+
+               sin6 = (struct sockaddr_in6 *)&dest;
+               sin6->sin6_family = AF_INET6;
+               sin6->sin6_addr = conn->c_faddr;
+               sin6->sin6_port = htons(RDS_CM_PORT);
+               sin6->sin6_scope_id = conn->c_dev_if;
+       }
 
        ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
                                (struct sockaddr *)&dest,
index e678699..0ec9df0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
                kfree_rcu(to_free, rcu);
 }
 
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+                        struct in6_addr *ipaddr)
 {
        struct rds_ib_device *rds_ibdev_old;
 
-       rds_ibdev_old = rds_ib_get_device(ipaddr);
+       rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
        if (!rds_ibdev_old)
-               return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+               return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
 
        if (rds_ibdev_old != rds_ibdev) {
-               rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+               rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
                rds_ib_dev_put(rds_ibdev_old);
-               return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+               return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
        }
        rds_ib_dev_put(rds_ibdev_old);
 
@@ -544,7 +545,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
        struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
        int ret;
 
-       rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+       rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
        if (!rds_ibdev) {
                ret = -ENODEV;
                goto out;
index 1eaf255..557ccbb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
                rds_ib_stats_inc(s_ib_rx_total_incs);
        }
        INIT_LIST_HEAD(&ibinc->ii_frags);
-       rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+       rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
 
        return ibinc;
 }
@@ -418,7 +418,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
                ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
                if (ret) {
                        rds_ib_conn_error(conn, "recv post on "
-                              "%pI4 returned %d, disconnecting and "
+                              "%pI6c returned %d, disconnecting and "
                               "reconnecting\n", &conn->c_faddr,
                               ret);
                        break;
@@ -848,7 +848,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 
        if (data_len < sizeof(struct rds_header)) {
                rds_ib_conn_error(conn, "incoming message "
-                      "from %pI4 didn't include a "
+                      "from %pI6c didn't include a "
                       "header, disconnecting and "
                       "reconnecting\n",
                       &conn->c_faddr);
@@ -861,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
        /* Validate the checksum. */
        if (!rds_message_verify_checksum(ihdr)) {
                rds_ib_conn_error(conn, "incoming message "
-                      "from %pI4 has corrupted header - "
+                      "from %pI6c has corrupted header - "
                       "forcing a reconnect\n",
                       &conn->c_faddr);
                rds_stats_inc(s_recv_drop_bad_checksum);
@@ -941,10 +941,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                ic->i_recv_data_rem = 0;
                ic->i_ibinc = NULL;
 
-               if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+               if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
                        rds_ib_cong_recv(conn, ibinc);
-               else {
-                       rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+               } else {
+                       rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
                                          &ibinc->ii_inc, GFP_ATOMIC);
                        state->ack_next = be64_to_cpu(hdr->h_sequence);
                        state->ack_next_valid = 1;
@@ -988,7 +988,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
        } else {
                /* We expect errors as the qp is drained during shutdown */
                if (rds_conn_up(conn) || rds_conn_connecting(conn))
-                       rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+                       rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
                                          &conn->c_laddr, &conn->c_faddr,
                                          wc->status,
                                          ib_wc_status_msg(wc->status));
index 8557a1c..c4cdfe4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
        /* We expect errors as the qp is drained during shutdown */
        if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-               rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+               rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
                                  &conn->c_laddr, &conn->c_faddr, wc->status,
                                  ib_wc_status_msg(wc->status));
        }
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                 first, &first->s_wr, ret, failed_wr);
        BUG_ON(failed_wr != &first->s_wr);
        if (ret) {
-               printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+               printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                rds_ib_sub_signaled(ic, nr_sig);
@@ -827,7 +827,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
                 send, &send->s_atomic_wr, ret, failed_wr);
        BUG_ON(failed_wr != &send->s_atomic_wr.wr);
        if (ret) {
-               printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+               printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                rds_ib_sub_signaled(ic, nr_sig);
@@ -967,7 +967,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                 first, &first->s_rdma_wr.wr, ret, failed_wr);
        BUG_ON(failed_wr != &first->s_rdma_wr.wr);
        if (ret) {
-               printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+               printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                rds_ib_sub_signaled(ic, nr_sig);
index feea1f9..1d73ad7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
 #include <linux/in.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/ipv6.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
@@ -88,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 
        BUG_ON(hdr_off || sg || off);
 
-       rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+       rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
        /* For the embedded inc. Matching put is in loop_inc_free() */
        rds_message_addref(rm);
 
-       rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+       rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
                          GFP_KERNEL);
 
        rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
index 634cfcb..7b39980 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Oracle.  All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
        long i;
        int ret;
 
-       if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
+       if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out;
        }
@@ -574,7 +574,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 
        args = CMSG_DATA(cmsg);
 
-       if (rs->rs_bound_addr == 0) {
+       if (ipv6_addr_any(&rs->rs_bound_addr)) {
                ret = -ENOTCONN; /* XXX not a great errno */
                goto out_ret;
        }
index fc59821..f49abef 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009 Oracle.  All rights reserved.
+ * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -39,8 +39,9 @@
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
-int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
-                             struct rdma_cm_event *event)
+static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+                                        struct rdma_cm_event *event,
+                                        bool isv6)
 {
        /* this can be null in the listening path */
        struct rds_connection *conn = cm_id->context;
@@ -72,7 +73,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
        switch (event->event) {
        case RDMA_CM_EVENT_CONNECT_REQUEST:
-               ret = trans->cm_handle_connect(cm_id, event);
+               ret = trans->cm_handle_connect(cm_id, event, isv6);
                break;
 
        case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +91,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
                        ibic = conn->c_transport_data;
                        if (ibic && ibic->i_cm_id == cm_id)
-                               ret = trans->cm_initiate_connect(cm_id);
+                               ret = trans->cm_initiate_connect(cm_id, isv6);
                        else
                                rds_conn_drop(conn);
                }
@@ -116,14 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
        case RDMA_CM_EVENT_DISCONNECTED:
                rdsdebug("DISCONNECT event - dropping connection "
-                       "%pI4->%pI4\n", &conn->c_laddr,
+                        "%pI6c->%pI6c\n", &conn->c_laddr,
                         &conn->c_faddr);
                rds_conn_drop(conn);
                break;
 
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                if (conn) {
-                       pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+                       pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
                                &conn->c_laddr, &conn->c_faddr);
                        rds_conn_drop(conn);
                }
@@ -146,13 +147,20 @@ out:
        return ret;
 }
 
-static int rds_rdma_listen_init(void)
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+                             struct rdma_cm_event *event)
+{
+       return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
+}
+
+static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
+                                      struct sockaddr *sa,
+                                      struct rdma_cm_id **ret_cm_id)
 {
-       struct sockaddr_in sin;
        struct rdma_cm_id *cm_id;
        int ret;
 
-       cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+       cm_id = rdma_create_id(&init_net, handler, NULL,
                               RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(cm_id)) {
                ret = PTR_ERR(cm_id);
@@ -161,15 +169,11 @@ static int rds_rdma_listen_init(void)
                return ret;
        }
 
-       sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
-       sin.sin_port = (__force u16)htons(RDS_PORT);
-
        /*
         * XXX I bet this binds the cm_id to a device.  If we want to support
         * fail-over we'll have to take this into consideration.
         */
-       ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+       ret = rdma_bind_addr(cm_id, sa);
        if (ret) {
                printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
                       "rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +189,7 @@ static int rds_rdma_listen_init(void)
 
        rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
 
-       rds_rdma_listen_id = cm_id;
+       *ret_cm_id = cm_id;
        cm_id = NULL;
 out:
        if (cm_id)
@@ -193,6 +197,26 @@ out:
        return ret;
 }
 
+/* Initialize the RDS RDMA listeners.  Two listeners are used for
+ * compatibility reasons.  The one on RDS_PORT is used for IPv4
+ * requests only.  The one on RDS_CM_PORT is used for IPv6 requests
+ * only, so only an IPv6 enabled RDS module will communicate using that
+ * port.  Only the IPv4 listener on RDS_PORT is created here for now.
+ */
+static int rds_rdma_listen_init(void)
+{
+       int ret;
+       struct sockaddr_in sin;
+
+       sin.sin_family = PF_INET;
+       sin.sin_addr.s_addr = htonl(INADDR_ANY);
+       sin.sin_port = htons(RDS_PORT);
+       ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
+                                         (struct sockaddr *)&sin,
+                                         &rds_rdma_listen_id);
+       return ret;
+}
+
 static void rds_rdma_listen_stop(void)
 {
        if (rds_rdma_listen_id) {
index f2272fb..1bff269 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/rds.h>
 #include <linux/rhashtable.h>
 #include <linux/refcount.h>
+#include <linux/in6.h>
 
 #include "info.h"
 
@@ -30,6 +31,7 @@
  * userspace from listening.
  */
 #define RDS_PORT       18634
+#define RDS_CM_PORT    16385
 
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
@@ -61,7 +63,7 @@ void rdsdebug(char *fmt, ...)
 
 struct rds_cong_map {
        struct rb_node          m_rb_node;
-       __be32                  m_addr;
+       struct in6_addr         m_addr;
        wait_queue_head_t       m_waitq;
        struct list_head        m_conn_list;
        unsigned long           m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +138,13 @@ struct rds_conn_path {
 /* One rds_connection per RDS address pair */
 struct rds_connection {
        struct hlist_node       c_hash_node;
-       __be32                  c_laddr;
-       __be32                  c_faddr;
+       struct in6_addr         c_laddr;
+       struct in6_addr         c_faddr;
+       int                     c_dev_if; /* c_laddr's interface index */
        unsigned int            c_loopback:1,
+                               c_isv6:1,
                                c_ping_triggered:1,
-                               c_pad_to_32:30;
+                               c_pad_to_32:29;
        int                     c_npaths;
        struct rds_connection   *c_passive;
        struct rds_transport    *c_trans;
@@ -269,7 +273,7 @@ struct rds_incoming {
        struct rds_conn_path    *i_conn_path;
        struct rds_header       i_hdr;
        unsigned long           i_rx_jiffies;
-       __be32                  i_saddr;
+       struct in6_addr         i_saddr;
 
        rds_rdma_cookie_t       i_rdma_cookie;
        struct timeval          i_rx_tstamp;
@@ -386,7 +390,7 @@ struct rds_message {
        struct list_head        m_conn_item;
        struct rds_incoming     m_inc;
        u64                     m_ack_seq;
-       __be32                  m_daddr;
+       struct in6_addr         m_daddr;
        unsigned long           m_flags;
 
        /* Never access m_rs without holding m_rs_lock.
@@ -519,7 +523,8 @@ struct rds_transport {
                                t_mp_capable:1;
        unsigned int            t_type;
 
-       int (*laddr_check)(struct net *net, __be32 addr);
+       int (*laddr_check)(struct net *net, const struct in6_addr *addr,
+                          __u32 scope_id);
        int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
        void (*conn_free)(void *data);
        int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -535,8 +540,8 @@ struct rds_transport {
        void (*inc_free)(struct rds_incoming *inc);
 
        int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
-                                struct rdma_cm_event *event);
-       int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+                                struct rdma_cm_event *event, bool isv6);
+       int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
        void (*cm_connect_complete)(struct rds_connection *conn,
                                    struct rdma_cm_event *event);
 
@@ -551,6 +556,12 @@ struct rds_transport {
        bool (*t_unloading)(struct rds_connection *conn);
 };
 
+/* Bind hash table key length.  It is the sum of the size of a struct
+ * in6_addr, a scope_id  and a port.
+ */
+#define RDS_BOUND_KEY_LEN \
+       (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
+
 struct rds_sock {
        struct sock             rs_sk;
 
@@ -562,10 +573,14 @@ struct rds_sock {
         * support.
         */
        struct rhash_head       rs_bound_node;
-       u64                     rs_bound_key;
-       __be32                  rs_bound_addr;
-       __be32                  rs_conn_addr;
-       __be16                  rs_bound_port;
+       u8                      rs_bound_key[RDS_BOUND_KEY_LEN];
+       struct sockaddr_in6     rs_bound_sin6;
+#define rs_bound_addr          rs_bound_sin6.sin6_addr
+#define rs_bound_addr_v4       rs_bound_sin6.sin6_addr.s6_addr32[3]
+#define rs_bound_port          rs_bound_sin6.sin6_port
+#define rs_bound_scope_id      rs_bound_sin6.sin6_scope_id
+       struct in6_addr         rs_conn_addr;
+#define rs_conn_addr_v4                rs_conn_addr.s6_addr32[3]
        __be16                  rs_conn_port;
        struct rds_transport    *rs_transport;
 
@@ -701,7 +716,8 @@ extern wait_queue_head_t rds_poll_waitq;
 /* bind.c */
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 void rds_remove_bound(struct rds_sock *rs);
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+                               __u32 scope_id);
 int rds_bind_lock_init(void);
 void rds_bind_lock_destroy(void);
 
@@ -725,11 +741,15 @@ extern u32 rds_gen_num;
 int rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(struct net *net,
-                                      __be32 laddr, __be32 faddr,
-                                      struct rds_transport *trans, gfp_t gfp);
+                                      const struct in6_addr *laddr,
+                                      const struct in6_addr *faddr,
+                                      struct rds_transport *trans, gfp_t gfp,
+                                      int dev_if);
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
-                                               __be32 laddr, __be32 faddr,
-                              struct rds_transport *trans, gfp_t gfp);
+                                               const struct in6_addr *laddr,
+                                               const struct in6_addr *faddr,
+                                               struct rds_transport *trans,
+                                               gfp_t gfp, int dev_if);
 void rds_conn_shutdown(struct rds_conn_path *cpath);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
@@ -840,11 +860,12 @@ void rds_page_exit(void);
 
 /* recv.c */
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
-                 __be32 saddr);
+                 struct in6_addr *saddr);
 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
-                      __be32 saddr);
+                      struct in6_addr *saddr);
 void rds_inc_put(struct rds_incoming *inc);
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+                      struct in6_addr *daddr,
                       struct rds_incoming *inc, gfp_t gfp);
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                int msg_flags);
@@ -859,7 +880,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
 void rds_send_path_reset(struct rds_conn_path *conn);
 int rds_send_xmit(struct rds_conn_path *cp);
 struct sockaddr_in;
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
                         is_acked_func is_acked);
@@ -946,11 +967,14 @@ void rds_send_worker(struct work_struct *);
 void rds_recv_worker(struct work_struct *);
 void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
 void rds_connect_complete(struct rds_connection *conn);
+int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
 
 /* transport.c */
 void rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+                                             const struct in6_addr *addr,
+                                             __u32 scope_id);
 void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
                                       unsigned int avail);
index 192ac6f..4217961 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
 #include "rds.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
-                 __be32 saddr)
+                struct in6_addr *saddr)
 {
        int i;
 
        refcount_set(&inc->i_refcount, 1);
        INIT_LIST_HEAD(&inc->i_item);
        inc->i_conn = conn;
-       inc->i_saddr = saddr;
+       inc->i_saddr = *saddr;
        inc->i_rdma_cookie = 0;
        inc->i_rx_tstamp.tv_sec = 0;
        inc->i_rx_tstamp.tv_usec = 0;
@@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
-                      __be32 saddr)
+                      struct in6_addr  *saddr)
 {
        refcount_set(&inc->i_refcount, 1);
        INIT_LIST_HEAD(&inc->i_item);
        inc->i_conn = cp->cp_conn;
        inc->i_conn_path = cp;
-       inc->i_saddr = saddr;
+       inc->i_saddr = *saddr;
        inc->i_rdma_cookie = 0;
        inc->i_rx_tstamp.tv_sec = 0;
        inc->i_rx_tstamp.tv_usec = 0;
@@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 
        now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
-       rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+       rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
          "now_cong %d delta %d\n",
          rs, &rs->rs_bound_addr,
          ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn)
        struct rds_conn_path *cp;
 
        if (conn->c_npaths > 1 &&
-           IS_CANONICAL(conn->c_laddr, conn->c_faddr)) {
+           rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
                for (i = 0; i < conn->c_npaths; i++) {
                        cp = &conn->c_path[i];
                        rds_conn_path_connect_if_down(cp);
@@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn)
  * conn.  This lets loopback, who only has one conn for both directions,
  * tell us which roles the addrs in the conn are playing for this message.
  */
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+                      struct in6_addr *daddr,
                       struct rds_incoming *inc, gfp_t gfp)
 {
        struct rds_sock *rs = NULL;
@@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 
        if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
                if (inc->i_hdr.h_sport == 0) {
-                       rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+                       rdsdebug("ignore ping with 0 sport from %pI6c\n",
+                                saddr);
                        goto out;
                }
                rds_stats_inc(s_recv_ping);
@@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
                goto out;
        }
 
-       rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+       rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if);
        if (!rs) {
                rds_stats_inc(s_recv_drop_no_sock);
                goto out;
@@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
        struct rds_sock *rs = rds_sk_to_rs(sk);
        long timeo;
        int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+       DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
        DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
        struct rds_incoming *inc = NULL;
 
@@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        break;
                }
 
-               rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+               rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
                         &inc->i_conn->c_faddr,
                         ntohs(inc->i_hdr.h_sport));
                ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
                rds_stats_inc(s_recv_delivered);
 
-               if (sin) {
-                       sin->sin_family = AF_INET;
-                       sin->sin_port = inc->i_hdr.h_sport;
-                       sin->sin_addr.s_addr = inc->i_saddr;
-                       memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-                       msg->msg_namelen = sizeof(*sin);
+               if (msg->msg_name) {
+                       if (ipv6_addr_v4mapped(&inc->i_saddr)) {
+                               sin = (struct sockaddr_in *)msg->msg_name;
+
+                               sin->sin_family = AF_INET;
+                               sin->sin_port = inc->i_hdr.h_sport;
+                               sin->sin_addr.s_addr =
+                                   inc->i_saddr.s6_addr32[3];
+                               memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+                               msg->msg_namelen = sizeof(*sin);
+                       } else {
+                               sin6 = (struct sockaddr_in6 *)msg->msg_name;
+
+                               sin6->sin6_family = AF_INET6;
+                               sin6->sin6_port = inc->i_hdr.h_sport;
+                               sin6->sin6_addr = inc->i_saddr;
+                               sin6->sin6_flowinfo = 0;
+                               sin6->sin6_scope_id = rs->rs_bound_scope_id;
+                               msg->msg_namelen = sizeof(*sin6);
+                       }
                }
                break;
        }
index 94c7f74..6ed2e92 100644 (file)
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 }
 EXPORT_SYMBOL_GPL(rds_send_drop_acked);
 
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
 {
        struct rds_message *rm, *tmp;
        struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
        spin_lock_irqsave(&rs->rs_lock, flags);
 
        list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
-               if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
-                            dest->sin_port != rm->m_inc.i_hdr.h_dport))
+               if (dest &&
+                   (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
+                    dest->sin6_port != rm->m_inc.i_hdr.h_dport))
                        continue;
 
                list_move(&rm->m_sock_item, &list);
@@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
+       DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
        DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
-       __be32 daddr;
        __be16 dport;
        struct rds_message *rm = NULL;
        struct rds_connection *conn;
@@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
        int nonblock = msg->msg_flags & MSG_DONTWAIT;
        long timeo = sock_sndtimeo(sk, nonblock);
        struct rds_conn_path *cpath;
+       struct in6_addr daddr;
+       __u32 scope_id = 0;
        size_t total_payload_len = payload_len, rdma_payload_len = 0;
        bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
                      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
        int num_sgs = ceil(payload_len, PAGE_SIZE);
+       int namelen;
 
        /* Mirror Linux UDP mirror of BSD error message compatibility */
        /* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1085,59 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                goto out;
        }
 
-       if (msg->msg_namelen) {
-               /* XXX fail non-unicast destination IPs? */
-               if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+       namelen = msg->msg_namelen;
+       if (namelen != 0) {
+               if (namelen < sizeof(*usin)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               switch (namelen) {
+               case sizeof(*usin):
+                       if (usin->sin_family != AF_INET ||
+                           usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+                           usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+                           IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
+                       dport = usin->sin_port;
+                       break;
+
+               case sizeof(*sin6): {
+                       ret = -EPROTONOSUPPORT;
+                       goto out;
+               }
+
+               default:
                        ret = -EINVAL;
                        goto out;
                }
-               daddr = usin->sin_addr.s_addr;
-               dport = usin->sin_port;
        } else {
                /* We only care about consistency with ->connect() */
                lock_sock(sk);
                daddr = rs->rs_conn_addr;
                dport = rs->rs_conn_port;
+               scope_id = rs->rs_bound_scope_id;
                release_sock(sk);
        }
 
        lock_sock(sk);
-       if (daddr == 0 || rs->rs_bound_addr == 0) {
+       if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
                release_sock(sk);
-               ret = -ENOTCONN; /* XXX not a great errno */
+               ret = -ENOTCONN;
                goto out;
+       } else if (namelen != 0) {
+               /* Cannot send to an IPv4 address using an IPv6 source
+                * address and cannot send to an IPv6 address using an
+                * IPv4 source address.
+                */
+               if (ipv6_addr_v4mapped(&daddr) ^
+                   ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+                       release_sock(sk);
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
        }
        release_sock(sk);
 
@@ -1155,13 +1191,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
        /* rds_conn_create has a spinlock that runs with IRQ off.
         * Caching the conn in the socket helps a lot. */
-       if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+       if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
                conn = rs->rs_conn;
        else {
                conn = rds_conn_create_outgoing(sock_net(sock->sk),
-                                               rs->rs_bound_addr, daddr,
-                                       rs->rs_transport,
-                                       sock->sk->sk_allocation);
+                                               &rs->rs_bound_addr, &daddr,
+                                               rs->rs_transport,
+                                               sock->sk->sk_allocation,
+                                               scope_id);
                if (IS_ERR(conn)) {
                        ret = PTR_ERR(conn);
                        goto out;
index 351a284..dadb337 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,8 @@
 #include <net/tcp.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/tcp.h>
+#include <net/addrconf.h>
 
 #include "rds.h"
 #include "tcp.h"
@@ -262,9 +264,44 @@ out:
        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
-static int rds_tcp_laddr_check(struct net *net, __be32 addr)
+/* Check whether @addr is a local address in @net.
+ *
+ * An IPv4-mapped address is checked with inet_addr_type(); any other
+ * address is checked with ipv6_chk_addr(), restricted to the interface
+ * with index @scope_id when @scope_id is non-zero.
+ *
+ * Returns 0 if the address is local, -EADDRNOTAVAIL otherwise (including
+ * when @scope_id does not name an interface in @net).
+ */
+static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+                              __u32 scope_id)
 {
-       if (inet_addr_type(net, addr) == RTN_LOCAL)
+       struct net_device *dev = NULL;
+       int ret;
+
+       if (ipv6_addr_v4mapped(addr)) {
+               if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
+                       return 0;
+               return -EADDRNOTAVAIL;
+       }
+
+       /* dev_get_by_index_rcu() takes no reference on the device, so the
+        * RCU read-side section has to cover every use of dev, including
+        * the ipv6_chk_addr() call below.
+        */
+       rcu_read_lock();
+       if (scope_id != 0) {
+               /* Check only the addresses hosted on this interface. */
+               dev = dev_get_by_index_rcu(net, scope_id);
+               if (!dev) {
+                       /* scope_id is not valid... */
+                       rcu_read_unlock();
+                       return -EADDRNOTAVAIL;
+               }
+       }
+       ret = ipv6_chk_addr(net, addr, dev, 0);
+       rcu_read_unlock();
+       if (ret)
                return 0;
        return -EADDRNOTAVAIL;
 }
index d999e70..231ae92 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
                 * RDS connection as RDS_CONN_UP until the reconnect,
                 * to avoid RDS datagram loss.
                 */
-               if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) &&
+               if (rds_addr_cmp(&cp->cp_conn->c_laddr,
+                                &cp->cp_conn->c_faddr) >= 0 &&
                    rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
                                             RDS_CONN_ERROR)) {
                        rds_conn_path_drop(cp, false);
@@ -88,7 +89,9 @@ out:
 int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 {
        struct socket *sock = NULL;
-       struct sockaddr_in src, dest;
+       struct sockaddr_in sin;
+       struct sockaddr *addr;
+       int addrlen;
        int ret;
        struct rds_connection *conn = cp->cp_conn;
        struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -112,30 +115,33 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 
        rds_tcp_tune(sock);
 
-       src.sin_family = AF_INET;
-       src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-       src.sin_port = (__force u16)htons(0);
+       sin.sin_family = AF_INET;
+       sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+       sin.sin_port = 0;
+       addr = (struct sockaddr *)&sin;
+       addrlen = sizeof(sin);
 
-       ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+       ret = sock->ops->bind(sock, addr, addrlen);
        if (ret) {
-               rdsdebug("bind failed with %d at address %pI4\n",
+               rdsdebug("bind failed with %d at address %pI6c\n",
                         ret, &conn->c_laddr);
                goto out;
        }
 
-       dest.sin_family = AF_INET;
-       dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-       dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+       sin.sin_family = AF_INET;
+       sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+       sin.sin_port = htons(RDS_TCP_PORT);
+       addr = (struct sockaddr *)&sin;
+       addrlen = sizeof(sin);
 
        /*
         * once we call connect() we can start getting callbacks and they
         * own the socket
         */
        rds_tcp_set_callbacks(sock, cp);
-       ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
-                                O_NONBLOCK);
+       ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
 
-       rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+       rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
        if (ret == -EINPROGRESS)
                ret = 0;
        if (ret == 0) {
index 2257118..4fdf5b3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2018 Oracle All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ static
 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
        int i;
-       bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
        int npaths = max_t(int, 1, conn->c_npaths);
 
        /* for mprds, all paths MUST be initiated by the peer
         * with the smaller address.
         */
-       if (!peer_is_smaller) {
+       if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
                /* Make sure we initiate at least one path if this
                 * has not already been done; rds_start_mprds() will
                 * take care of additional paths, if necessary.
@@ -164,13 +163,16 @@ int rds_tcp_accept_one(struct socket *sock)
 
        inet = inet_sk(new_sock->sk);
 
-       rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
-                &inet->inet_saddr, ntohs(inet->inet_sport),
-                &inet->inet_daddr, ntohs(inet->inet_dport));
+       rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n",
+                &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport),
+                &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport));
 
        conn = rds_conn_create(sock_net(sock->sk),
-                              inet->inet_saddr, inet->inet_daddr,
-                              &rds_tcp_transport, GFP_KERNEL);
+                              &new_sock->sk->sk_v6_rcv_saddr,
+                              &new_sock->sk->sk_v6_daddr,
+                              &rds_tcp_transport, GFP_KERNEL,
+                              new_sock->sk->sk_bound_dev_if);
+
        if (IS_ERR(conn)) {
                ret = PTR_ERR(conn);
                goto out;
index b9fbd2e..42c5ff1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
                        tc->t_tinc = tinc;
                        rdsdebug("alloced tinc %p\n", tinc);
                        rds_inc_path_init(&tinc->ti_inc, cp,
-                                         cp->cp_conn->c_faddr);
+                                         &cp->cp_conn->c_faddr);
                        tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
                                        local_clock();
 
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
                        if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
                                rds_tcp_cong_recv(conn, tinc);
                        else
-                               rds_recv_incoming(conn, conn->c_faddr,
-                                                 conn->c_laddr, &tinc->ti_inc,
+                               rds_recv_incoming(conn, &conn->c_faddr,
+                                                 &conn->c_laddr,
+                                                 &tinc->ti_inc,
                                                  arg->gfp);
 
                        tc->t_tinc_hdr_rem = sizeof(struct rds_header);
index 7df869d..78a2554 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ out:
                         * an incoming RST.
                         */
                        if (rds_conn_path_up(cp)) {
-                               pr_warn("RDS/tcp: send to %pI4 on cp [%d]"
+                               pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
                                        "returned %d, "
                                        "disconnecting and reconnecting\n",
                                        &conn->c_faddr, cp->cp_index, ret);
index c52861d..e64f9e4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
                return;
        }
 
-       rdsdebug("conn %p for %pI4 to %pI4 complete\n",
-         cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
+       rdsdebug("conn %p for %pI6c to %pI6c complete\n",
+                cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
 
        cp->cp_reconnect_jiffies = 0;
        set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
        unsigned long rand;
        struct rds_connection *conn = cp->cp_conn;
 
-       rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
-         conn, &conn->c_laddr, &conn->c_faddr,
-         cp->cp_reconnect_jiffies);
+       rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
+                conn, &conn->c_laddr, &conn->c_faddr,
+                cp->cp_reconnect_jiffies);
 
        /* let peer with smaller addr initiate reconnect, to avoid duels */
        if (conn->c_trans->t_type == RDS_TRANS_TCP &&
-           !IS_CANONICAL(conn->c_laddr, conn->c_faddr))
+           rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
                return;
 
        set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
        }
 
        get_random_bytes(&rand, sizeof(rand));
-       rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+       rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
                 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
                 conn, &conn->c_laddr, &conn->c_faddr);
        rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
        int ret;
 
        if (cp->cp_index > 0 &&
-           !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr))
+           rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
                return;
        clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
        ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
        if (ret) {
                ret = conn->c_trans->conn_path_connect(cp);
-               rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
-                       conn, &conn->c_laddr, &conn->c_faddr, ret);
+               rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
+                        conn, &conn->c_laddr, &conn->c_faddr, ret);
 
                if (ret) {
                        if (rds_conn_path_transition(cp,
@@ -259,3 +259,45 @@ int rds_threads_init(void)
 
        return 0;
 }
+
+/* Order two IPv6 addresses as 128-bit big-endian (network order) integers.
+ *
+ * Returns -1 when addr1 sorts before addr2, 1 when it sorts after, and 0
+ * when both addresses are identical.
+ */
+int rds_addr_cmp(const struct in6_addr *addr1,
+                const struct in6_addr *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+       /* Compare as two 64-bit halves, most significant half first. */
+       const __be64 *lhs = (const __be64 *)addr1;
+       const __be64 *rhs = (const __be64 *)addr2;
+       u64 lo1, lo2;
+
+       if (lhs[0] != rhs[0])
+               return be64_to_cpu(lhs[0]) < be64_to_cpu(rhs[0]) ? -1 : 1;
+
+       lo1 = be64_to_cpu(lhs[1]);
+       lo2 = be64_to_cpu(rhs[1]);
+       if (lo1 == lo2)
+               return 0;
+       return lo1 < lo2 ? -1 : 1;
+#else
+       /* Compare 32-bit words, most significant first; the first word
+        * that differs decides the order.
+        */
+       int idx;
+
+       for (idx = 0; idx < 4; idx++) {
+               u32 w1, w2;
+
+               if (addr1->s6_addr32[idx] == addr2->s6_addr32[idx])
+                       continue;
+               w1 = ntohl(addr1->s6_addr32[idx]);
+               w2 = ntohl(addr2->s6_addr32[idx]);
+               return w1 < w2 ? -1 : 1;
+       }
+       return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(rds_addr_cmp);
index 0b188dd..c9788db 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 
 #include "rds.h"
 #include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
                module_put(trans->t_owner);
 }
 
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+                                             const struct in6_addr *addr,
+                                             __u32 scope_id)
 {
        struct rds_transport *ret = NULL;
        struct rds_transport *trans;
        unsigned int i;
 
-       if (IN_LOOPBACK(ntohl(addr)))
+       if (ipv6_addr_v4mapped(addr)) {
+               if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
+                       return &rds_loop_transport;
+       } else if (ipv6_addr_loopback(addr)) {
                return &rds_loop_transport;
+       }
 
        down_read(&rds_trans_sem);
        for (i = 0; i < RDS_TRANS_COUNT; i++) {
                trans = transports[i];
 
-               if (trans && (trans->laddr_check(net, addr) == 0) &&
+               if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
                    (!trans->t_owner || try_module_get(trans->t_owner))) {
                        ret = trans;
                        break;