drivers/block/nbd.c
1 /*
2  * Network block device - make block devices work over TCP
3  *
4  * Note that you cannot swap over this thing, yet. Seems to work but
5  * deadlocks sometimes - you cannot swap over TCP in general.
6  * 
7  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
8  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
9  *
10  * This file is released under GPLv2 or later.
11  *
12  * (part of code stolen from loop.c)
13  */
14
15 #include <linux/major.h>
16
17 #include <linux/blkdev.h>
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/sched.h>
21 #include <linux/sched/mm.h>
22 #include <linux/fs.h>
23 #include <linux/bio.h>
24 #include <linux/stat.h>
25 #include <linux/errno.h>
26 #include <linux/file.h>
27 #include <linux/ioctl.h>
28 #include <linux/mutex.h>
29 #include <linux/compiler.h>
30 #include <linux/err.h>
31 #include <linux/kernel.h>
32 #include <linux/slab.h>
33 #include <net/sock.h>
34 #include <linux/net.h>
35 #include <linux/kthread.h>
36 #include <linux/types.h>
37 #include <linux/debugfs.h>
38 #include <linux/blk-mq.h>
39
40 #include <linux/uaccess.h>
41 #include <asm/types.h>
42
43 #include <linux/nbd.h>
44 #include <linux/nbd-netlink.h>
45 #include <net/genetlink.h>
46
47 static DEFINE_IDR(nbd_index_idr);
48 static DEFINE_MUTEX(nbd_index_mutex);
49 static int nbd_total_devices = 0;
50
51 struct nbd_sock {
52         struct socket *sock;
53         struct mutex tx_lock;
54         struct request *pending;
55         int sent;
56         bool dead;
57         int fallback_index;
58         int cookie;
59 };
60
61 struct recv_thread_args {
62         struct work_struct work;
63         struct nbd_device *nbd;
64         int index;
65 };
66
67 struct link_dead_args {
68         struct work_struct work;
69         int index;
70 };
71
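/* Bit numbers used with nbd_config->runtime_flags */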
72 #define NBD_TIMEDOUT                    0
73 #define NBD_DISCONNECT_REQUESTED        1
74 #define NBD_DISCONNECTED                2
75 #define NBD_HAS_PID_FILE                3
76 #define NBD_HAS_CONFIG_REF              4
77 #define NBD_BOUND                       5
78 #define NBD_DESTROY_ON_DISCONNECT       6
79 #define NBD_DISCONNECT_ON_CLOSE         7
80
81 struct nbd_config {
82         u32 flags;
83         unsigned long runtime_flags;
84         u64 dead_conn_timeout;
85
86         struct nbd_sock **socks;
87         int num_connections;
88         atomic_t live_connections;
89         wait_queue_head_t conn_wait;
90
91         atomic_t recv_threads;
92         wait_queue_head_t recv_wq;
93         loff_t blksize;
94         loff_t bytesize;
95 #if IS_ENABLED(CONFIG_DEBUG_FS)
96         struct dentry *dbg_dir;
97 #endif
98 };
99
100 struct nbd_device {
101         struct blk_mq_tag_set tag_set;
102
103         int index;
104         refcount_t config_refs;
105         refcount_t refs;
106         struct nbd_config *config;
107         struct mutex config_lock;
108         struct gendisk *disk;
109
110         struct list_head list;
111         struct task_struct *task_recv;
112         struct task_struct *task_setup;
113 };
114
115 struct nbd_cmd {
116         struct nbd_device *nbd;
117         int index;
118         int cookie;
119         struct completion send_complete;
120         blk_status_t status;
121 };
122
123 #if IS_ENABLED(CONFIG_DEBUG_FS)
124 static struct dentry *nbd_dbg_dir;
125 #endif
126
127 #define nbd_name(nbd) ((nbd)->disk->disk_name)
128
129 #define NBD_MAGIC 0x68797548
130
131 static unsigned int nbds_max = 16;
132 static int max_part = 16;
133 static struct workqueue_struct *recv_workqueue;
134 static int part_shift;
135
136 static int nbd_dev_dbg_init(struct nbd_device *nbd);
137 static void nbd_dev_dbg_close(struct nbd_device *nbd);
138 static void nbd_config_put(struct nbd_device *nbd);
139 static void nbd_connect_reply(struct genl_info *info, int index);
140 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
141 static void nbd_dead_link_work(struct work_struct *work);
142 static void nbd_disconnect_and_put(struct nbd_device *nbd);
143
144 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
145 {
146         return disk_to_dev(nbd->disk);
147 }
148
149 static const char *nbdcmd_to_ascii(int cmd)
150 {
151         switch (cmd) {
152         case  NBD_CMD_READ: return "read";
153         case NBD_CMD_WRITE: return "write";
154         case  NBD_CMD_DISC: return "disconnect";
155         case NBD_CMD_FLUSH: return "flush";
156         case  NBD_CMD_TRIM: return "trim/discard";
157         }
158         return "invalid";
159 }
160
161 static ssize_t pid_show(struct device *dev,
162                         struct device_attribute *attr, char *buf)
163 {
164         struct gendisk *disk = dev_to_disk(dev);
165         struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
166
167         return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
168 }
169
170 static const struct device_attribute pid_attr = {
171         .attr = { .name = "pid", .mode = 0444},
172         .show = pid_show,
173 };
174
175 static void nbd_dev_remove(struct nbd_device *nbd)
176 {
177         struct gendisk *disk = nbd->disk;
178         struct request_queue *q;
179
180         if (disk) {
181                 q = disk->queue;
182                 del_gendisk(disk);
183                 blk_cleanup_queue(q);
184                 blk_mq_free_tag_set(&nbd->tag_set);
185                 disk->private_data = NULL;
186                 put_disk(disk);
187         }
188         kfree(nbd);
189 }
190
191 static void nbd_put(struct nbd_device *nbd)
192 {
193         if (refcount_dec_and_mutex_lock(&nbd->refs,
194                                         &nbd_index_mutex)) {
195                 idr_remove(&nbd_index_idr, nbd->index);
196                 mutex_unlock(&nbd_index_mutex);
197                 nbd_dev_remove(nbd);
198         }
199 }
200
201 static int nbd_disconnected(struct nbd_config *config)
202 {
203         return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
204                 test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
205 }
206
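/*
 * Mark a connection dead: shut the socket down, drop it from the live
 * connection count and reset any partially sent request state.  When
 * @notify is set and we are not already disconnecting, nbd_dead_link_work()
 * is scheduled for the device as well.  Called with nsock->tx_lock held.
 */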
207 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
208                                 int notify)
209 {
210         if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
211                 struct link_dead_args *args;
212                 args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
213                 if (args) {
214                         INIT_WORK(&args->work, nbd_dead_link_work);
215                         args->index = nbd->index;
216                         queue_work(system_wq, &args->work);
217                 }
218         }
219         if (!nsock->dead) {
220                 kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
221                 if (atomic_dec_return(&nbd->config->live_connections) == 0) {
222                         if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
223                                                &nbd->config->runtime_flags)) {
224                                 set_bit(NBD_DISCONNECTED,
225                                         &nbd->config->runtime_flags);
226                                 dev_info(nbd_to_dev(nbd),
227                                         "Disconnected due to user request.\n");
228                         }
229                 }
230         }
231         nsock->dead = true;
232         nsock->pending = NULL;
233         nsock->sent = 0;
234 }
235
236 static void nbd_size_clear(struct nbd_device *nbd)
237 {
238         if (nbd->config->bytesize) {
239                 set_capacity(nbd->disk, 0);
240                 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
241         }
242 }
243
244 static void nbd_size_update(struct nbd_device *nbd)
245 {
246         struct nbd_config *config = nbd->config;
247         struct block_device *bdev = bdget_disk(nbd->disk, 0);
248
249         if (config->flags & NBD_FLAG_SEND_TRIM) {
250                 nbd->disk->queue->limits.discard_granularity = config->blksize;
251                 nbd->disk->queue->limits.discard_alignment = config->blksize;
252                 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
253         }
254         blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
255         blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
256         set_capacity(nbd->disk, config->bytesize >> 9);
257         if (bdev) {
258                 if (bdev->bd_disk)
259                         bd_set_size(bdev, config->bytesize);
260                 else
261                         bdev->bd_invalidated = 1;
262                 bdput(bdev);
263         }
264         kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
265 }
266
267 static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
268                          loff_t nr_blocks)
269 {
270         struct nbd_config *config = nbd->config;
271         config->blksize = blocksize;
272         config->bytesize = blocksize * nr_blocks;
273         if (nbd->task_recv != NULL)
274                 nbd_size_update(nbd);
275 }
276
277 static void nbd_complete_rq(struct request *req)
278 {
279         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
280
281         dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
282                 cmd->status ? "failed" : "done");
283
284         blk_mq_end_request(req, cmd->status);
285 }
286
287 /*
288  * Forcibly shut down the sockets, causing all pending receivers to error out
289  */
290 static void sock_shutdown(struct nbd_device *nbd)
291 {
292         struct nbd_config *config = nbd->config;
293         int i;
294
295         if (config->num_connections == 0)
296                 return;
297         if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
298                 return;
299
300         for (i = 0; i < config->num_connections; i++) {
301                 struct nbd_sock *nsock = config->socks[i];
302                 mutex_lock(&nsock->tx_lock);
303                 nbd_mark_nsock_dead(nbd, nsock, 0);
304                 mutex_unlock(&nsock->tx_lock);
305         }
306         dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
307 }
308
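/*
 * blk-mq timeout handler.  With multiple connections the timed-out socket is
 * marked dead and the request is requeued; with a single connection the
 * request is failed and all sockets are shut down.
 */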
309 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
310                                                  bool reserved)
311 {
312         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
313         struct nbd_device *nbd = cmd->nbd;
314         struct nbd_config *config;
315
316         if (!refcount_inc_not_zero(&nbd->config_refs)) {
317                 cmd->status = BLK_STS_TIMEOUT;
318                 goto done;
319         }
320         config = nbd->config;
321
322         if (config->num_connections > 1) {
323                 dev_err_ratelimited(nbd_to_dev(nbd),
324                                     "Connection timed out, retrying (%d/%d alive)\n",
325                                     atomic_read(&config->live_connections),
326                                     config->num_connections);
327                 /*
328                  * Hooray, we have more connections; requeue this IO and the
329                  * submit path will put it on a live connection.
330                  */
331                 if (config->socks && config->num_connections > 1) {
332                         if (cmd->index < config->num_connections) {
333                                 struct nbd_sock *nsock =
334                                         config->socks[cmd->index];
335                                 mutex_lock(&nsock->tx_lock);
336                                 /* We can have multiple outstanding requests, so
337                                  * we don't want to mark the nsock dead if we've
338                                  * already reconnected with a new socket, so
339                  * only mark it dead if it's the same socket we
340                                  * were sent out on.
341                                  */
342                                 if (cmd->cookie == nsock->cookie)
343                                         nbd_mark_nsock_dead(nbd, nsock, 1);
344                                 mutex_unlock(&nsock->tx_lock);
345                         }
346                         blk_mq_requeue_request(req, true);
347                         nbd_config_put(nbd);
348                         return BLK_EH_DONE;
349                 }
350         } else {
351                 dev_err_ratelimited(nbd_to_dev(nbd),
352                                     "Connection timed out\n");
353         }
354         set_bit(NBD_TIMEDOUT, &config->runtime_flags);
355         cmd->status = BLK_STS_IOERR;
356         sock_shutdown(nbd);
357         nbd_config_put(nbd);
358 done:
359         blk_mq_complete_request(req);
360         return BLK_EH_DONE;
361 }
362
363 /*
364  *  Send or receive a packet; *sent (if non-NULL) tracks bytes sent so a partial send can be resumed.
365  */
366 static int sock_xmit(struct nbd_device *nbd, int index, int send,
367                      struct iov_iter *iter, int msg_flags, int *sent)
368 {
369         struct nbd_config *config = nbd->config;
370         struct socket *sock = config->socks[index]->sock;
371         int result;
372         struct msghdr msg;
373         unsigned int noreclaim_flag;
374
375         if (unlikely(!sock)) {
376                 dev_err_ratelimited(disk_to_dev(nbd->disk),
377                         "Attempted %s on closed socket in sock_xmit\n",
378                         (send ? "send" : "recv"));
379                 return -EINVAL;
380         }
381
382         msg.msg_iter = *iter;
383
384         noreclaim_flag = memalloc_noreclaim_save();
385         do {
386                 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
387                 msg.msg_name = NULL;
388                 msg.msg_namelen = 0;
389                 msg.msg_control = NULL;
390                 msg.msg_controllen = 0;
391                 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
392
393                 if (send)
394                         result = sock_sendmsg(sock, &msg);
395                 else
396                         result = sock_recvmsg(sock, &msg, msg.msg_flags);
397
398                 if (result <= 0) {
399                         if (result == 0)
400                                 result = -EPIPE; /* short read */
401                         break;
402                 }
403                 if (sent)
404                         *sent += result;
405         } while (msg_data_left(&msg));
406
407         memalloc_noreclaim_restore(noreclaim_flag);
408
409         return result;
410 }
411
412 /*
413  * Different settings for sk->sk_sndtimeo can make sendmsg return either
414  * -EINTR or -ERESTARTSYS when a signal is pending; treat both as interrupted.
415  */
416 static inline int was_interrupted(int result)
417 {
418         return result == -ERESTARTSYS || result == -EINTR;
419 }
420
421 /* always call with the tx_lock held */
422 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
423 {
424         struct request *req = blk_mq_rq_from_pdu(cmd);
425         struct nbd_config *config = nbd->config;
426         struct nbd_sock *nsock = config->socks[index];
427         int result;
428         struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
429         struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
430         struct iov_iter from;
431         unsigned long size = blk_rq_bytes(req);
432         struct bio *bio;
433         u32 type;
434         u32 nbd_cmd_flags = 0;
435         u32 tag = blk_mq_unique_tag(req);
436         int sent = nsock->sent, skip = 0;
437
438         iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
439
440         switch (req_op(req)) {
441         case REQ_OP_DISCARD:
442                 type = NBD_CMD_TRIM;
443                 break;
444         case REQ_OP_FLUSH:
445                 type = NBD_CMD_FLUSH;
446                 break;
447         case REQ_OP_WRITE:
448                 type = NBD_CMD_WRITE;
449                 break;
450         case REQ_OP_READ:
451                 type = NBD_CMD_READ;
452                 break;
453         default:
454                 return -EIO;
455         }
456
457         if (rq_data_dir(req) == WRITE &&
458             (config->flags & NBD_FLAG_READ_ONLY)) {
459                 dev_err_ratelimited(disk_to_dev(nbd->disk),
460                                     "Write on read-only\n");
461                 return -EIO;
462         }
463
464         if (req->cmd_flags & REQ_FUA)
465                 nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
466
467         /* We did a partial send previously.  If the whole request struct has
468          * already gone out, skip straight to sending the remaining pages;
469          * otherwise resume sending the header from where we left off.
470          */
471         if (sent) {
472                 if (sent >= sizeof(request)) {
473                         skip = sent - sizeof(request);
474                         goto send_pages;
475                 }
476                 iov_iter_advance(&from, sent);
477         }
478         cmd->index = index;
479         cmd->cookie = nsock->cookie;
480         request.type = htonl(type | nbd_cmd_flags);
481         if (type != NBD_CMD_FLUSH) {
482                 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
483                 request.len = htonl(size);
484         }
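        /* Stash the blk-mq unique tag in the handle so nbd_read_stat() can
         * match the server's reply back to this request.
         */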
485         memcpy(request.handle, &tag, sizeof(tag));
486
487         dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
488                 req, nbdcmd_to_ascii(type),
489                 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
490         result = sock_xmit(nbd, index, 1, &from,
491                         (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
492         if (result <= 0) {
493                 if (was_interrupted(result)) {
494                         /* If we haven't sent anything we can just return BUSY,
495                          * however if we have sent something we need to make
496                          * sure we only allow this req to be sent until we are
497                          * completely done.
498                          */
499                         if (sent) {
500                                 nsock->pending = req;
501                                 nsock->sent = sent;
502                         }
503                         return BLK_STS_RESOURCE;
504                 }
505                 dev_err_ratelimited(disk_to_dev(nbd->disk),
506                         "Send control failed (result %d)\n", result);
507                 return -EAGAIN;
508         }
509 send_pages:
510         if (type != NBD_CMD_WRITE)
511                 goto out;
512
513         bio = req->bio;
514         while (bio) {
515                 struct bio *next = bio->bi_next;
516                 struct bvec_iter iter;
517                 struct bio_vec bvec;
518
519                 bio_for_each_segment(bvec, bio, iter) {
520                         bool is_last = !next && bio_iter_last(bvec, iter);
521                         int flags = is_last ? 0 : MSG_MORE;
522
523                         dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
524                                 req, bvec.bv_len);
525                         iov_iter_bvec(&from, ITER_BVEC | WRITE,
526                                       &bvec, 1, bvec.bv_len);
527                         if (skip) {
528                                 if (skip >= iov_iter_count(&from)) {
529                                         skip -= iov_iter_count(&from);
530                                         continue;
531                                 }
532                                 iov_iter_advance(&from, skip);
533                                 skip = 0;
534                         }
535                         result = sock_xmit(nbd, index, 1, &from, flags, &sent);
536                         if (result <= 0) {
537                                 if (was_interrupted(result)) {
538                                         /* We've already sent the header, we
539                                          * have no choice but to set pending and
540                                          * return BUSY.
541                                          */
542                                         nsock->pending = req;
543                                         nsock->sent = sent;
544                                         return BLK_STS_RESOURCE;
545                                 }
546                                 dev_err(disk_to_dev(nbd->disk),
547                                         "Send data failed (result %d)\n",
548                                         result);
549                                 return -EAGAIN;
550                         }
551                         /*
552                          * The completion might already have come in,
553                          * so break for the last one instead of letting
554                          * the iterator do it. This prevents use-after-free
555                          * of the bio.
556                          */
557                         if (is_last)
558                                 break;
559                 }
560                 bio = next;
561         }
562 out:
563         nsock->pending = NULL;
564         nsock->sent = 0;
565         return 0;
566 }
567
568 /* An ERR_PTR return means the receive failed; the caller marks the socket dead */
569 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
570 {
571         struct nbd_config *config = nbd->config;
572         int result;
573         struct nbd_reply reply;
574         struct nbd_cmd *cmd;
575         struct request *req = NULL;
576         u16 hwq;
577         u32 tag;
578         struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
579         struct iov_iter to;
580
581         reply.magic = 0;
582         iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
583         result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
584         if (result <= 0) {
585                 if (!nbd_disconnected(config))
586                         dev_err(disk_to_dev(nbd->disk),
587                                 "Receive control failed (result %d)\n", result);
588                 return ERR_PTR(result);
589         }
590
591         if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
592                 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
593                                 (unsigned long)ntohl(reply.magic));
594                 return ERR_PTR(-EPROTO);
595         }
596
597         memcpy(&tag, reply.handle, sizeof(u32));
598
599         hwq = blk_mq_unique_tag_to_hwq(tag);
600         if (hwq < nbd->tag_set.nr_hw_queues)
601                 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
602                                        blk_mq_unique_tag_to_tag(tag));
603         if (!req || !blk_mq_request_started(req)) {
604                 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
605                         tag, req);
606                 return ERR_PTR(-ENOENT);
607         }
608         cmd = blk_mq_rq_to_pdu(req);
609         if (ntohl(reply.error)) {
610                 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
611                         ntohl(reply.error));
612                 cmd->status = BLK_STS_IOERR;
613                 return cmd;
614         }
615
616         dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
617         if (rq_data_dir(req) != WRITE) {
618                 struct req_iterator iter;
619                 struct bio_vec bvec;
620
621                 rq_for_each_segment(bvec, req, iter) {
622                         iov_iter_bvec(&to, ITER_BVEC | READ,
623                                       &bvec, 1, bvec.bv_len);
624                         result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
625                         if (result <= 0) {
626                                 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
627                                         result);
628                                 /*
629                                  * If we've disconnected or we only have 1
630                                  * connection then we need to make sure we
631                                  * complete this request, otherwise error out
632                                  * and let the timeout handler resubmit
633                                  * this request onto another connection.
634                                  */
635                                 if (nbd_disconnected(config) ||
636                                     config->num_connections <= 1) {
637                                         cmd->status = BLK_STS_IOERR;
638                                         return cmd;
639                                 }
640                                 return ERR_PTR(-EIO);
641                         }
642                         dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
643                                 req, bvec.bv_len);
644                 }
645         } else {
646                 /* See the comment in nbd_queue_rq. */
647                 wait_for_completion(&cmd->send_complete);
648         }
649         return cmd;
650 }
651
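/*
 * Per-connection receive worker: keep completing replies until the socket
 * errors out, then mark the connection dead and exit.
 */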
652 static void recv_work(struct work_struct *work)
653 {
654         struct recv_thread_args *args = container_of(work,
655                                                      struct recv_thread_args,
656                                                      work);
657         struct nbd_device *nbd = args->nbd;
658         struct nbd_config *config = nbd->config;
659         struct nbd_cmd *cmd;
660
661         while (1) {
662                 cmd = nbd_read_stat(nbd, args->index);
663                 if (IS_ERR(cmd)) {
664                         struct nbd_sock *nsock = config->socks[args->index];
665
666                         mutex_lock(&nsock->tx_lock);
667                         nbd_mark_nsock_dead(nbd, nsock, 1);
668                         mutex_unlock(&nsock->tx_lock);
669                         break;
670                 }
671
672                 blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
673         }
674         atomic_dec(&config->recv_threads);
675         wake_up(&config->recv_wq);
676         nbd_config_put(nbd);
677         kfree(args);
678 }
679
680 static void nbd_clear_req(struct request *req, void *data, bool reserved)
681 {
682         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
683
684         cmd->status = BLK_STS_IOERR;
685         blk_mq_complete_request(req);
686 }
687
688 static void nbd_clear_que(struct nbd_device *nbd)
689 {
690         blk_mq_quiesce_queue(nbd->disk->queue);
691         blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
692         blk_mq_unquiesce_queue(nbd->disk->queue);
693         dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
694 }
695
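/*
 * Find another live connection to retry on after the socket at @index has
 * died.  Returns the new index, or a negative value if no live connection
 * is available.
 */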
696 static int find_fallback(struct nbd_device *nbd, int index)
697 {
698         struct nbd_config *config = nbd->config;
699         int new_index = -1;
700         struct nbd_sock *nsock = config->socks[index];
701         int fallback = nsock->fallback_index;
702
703         if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
704                 return new_index;
705
706         if (config->num_connections <= 1) {
707                 dev_err_ratelimited(disk_to_dev(nbd->disk),
708                                     "Attempted send on invalid socket\n");
709                 return new_index;
710         }
711
712         if (fallback >= 0 && fallback < config->num_connections &&
713             !config->socks[fallback]->dead)
714                 return fallback;
715
716         if (nsock->fallback_index < 0 ||
717             nsock->fallback_index >= config->num_connections ||
718             config->socks[nsock->fallback_index]->dead) {
719                 int i;
720                 for (i = 0; i < config->num_connections; i++) {
721                         if (i == index)
722                                 continue;
723                         if (!config->socks[i]->dead) {
724                                 new_index = i;
725                                 break;
726                         }
727                 }
728                 nsock->fallback_index = new_index;
729                 if (new_index < 0) {
730                         dev_err_ratelimited(disk_to_dev(nbd->disk),
731                                             "Dead connection, failed to find a fallback\n");
732                         return new_index;
733                 }
734         }
735         new_index = nsock->fallback_index;
736         return new_index;
737 }
738
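/*
 * Wait up to dead_conn_timeout for a connection to come back.  Returns
 * nonzero if one did; returns zero on timeout, when no dead connection
 * timeout is configured, or when the device is disconnecting.
 */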
739 static int wait_for_reconnect(struct nbd_device *nbd)
740 {
741         struct nbd_config *config = nbd->config;
742         if (!config->dead_conn_timeout)
743                 return 0;
744         if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
745                 return 0;
746         return wait_event_timeout(config->conn_wait,
747                                   atomic_read(&config->live_connections) > 0,
748                                   config->dead_conn_timeout) > 0;
749 }
750
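/*
 * Send one command on the given connection, falling back to another live
 * socket (or waiting for a reconnect) if that connection has died.
 * Transient send failures requeue the request rather than failing it.
 */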
751 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
752 {
753         struct request *req = blk_mq_rq_from_pdu(cmd);
754         struct nbd_device *nbd = cmd->nbd;
755         struct nbd_config *config;
756         struct nbd_sock *nsock;
757         int ret;
758
759         if (!refcount_inc_not_zero(&nbd->config_refs)) {
760                 dev_err_ratelimited(disk_to_dev(nbd->disk),
761                                     "Socks array is empty\n");
762                 blk_mq_start_request(req);
763                 return -EINVAL;
764         }
765         config = nbd->config;
766
767         if (index >= config->num_connections) {
768                 dev_err_ratelimited(disk_to_dev(nbd->disk),
769                                     "Attempted send on invalid socket\n");
770                 nbd_config_put(nbd);
771                 blk_mq_start_request(req);
772                 return -EINVAL;
773         }
774         cmd->status = BLK_STS_OK;
775 again:
776         nsock = config->socks[index];
777         mutex_lock(&nsock->tx_lock);
778         if (nsock->dead) {
779                 int old_index = index;
780                 index = find_fallback(nbd, index);
781                 mutex_unlock(&nsock->tx_lock);
782                 if (index < 0) {
783                         if (wait_for_reconnect(nbd)) {
784                                 index = old_index;
785                                 goto again;
786                         }
787                         /* All the sockets should already be down at this point,
788                          * we just want to make sure that DISCONNECTED is set so
789                          * any requests that come in that were queued waiting
790                          * for the reconnect timer don't trigger the timer again
791                          * and instead just error out.
792                          */
793                         sock_shutdown(nbd);
794                         nbd_config_put(nbd);
795                         blk_mq_start_request(req);
796                         return -EIO;
797                 }
798                 goto again;
799         }
800
801         /* Handle the case where a pending, partially transmitted request _has_
802          * to be serviced first.  We need to call requeue
803          * here so that it gets put _after_ the request that is already on the
804          * dispatch list.
805          */
806         blk_mq_start_request(req);
807         if (unlikely(nsock->pending && nsock->pending != req)) {
808                 blk_mq_requeue_request(req, true);
809                 ret = 0;
810                 goto out;
811         }
812         /*
813          * Some failures are related to the link going down, so anything that
814          * returns EAGAIN can be retried on a different socket.
815          */
816         ret = nbd_send_cmd(nbd, cmd, index);
817         if (ret == -EAGAIN) {
818                 dev_err_ratelimited(disk_to_dev(nbd->disk),
819                                     "Request send failed, requeueing\n");
820                 nbd_mark_nsock_dead(nbd, nsock, 1);
821                 blk_mq_requeue_request(req, true);
822                 ret = 0;
823         }
824 out:
825         mutex_unlock(&nsock->tx_lock);
826         nbd_config_put(nbd);
827         return ret;
828 }
829
830 static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
831                         const struct blk_mq_queue_data *bd)
832 {
833         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
834         int ret;
835
836         /*
837          * Since we look at the bios to send the request over the network we
838          * need to make sure the completion work doesn't mark this request done
839          * before we are done doing our send.  This keeps us from dereferencing
840          * freed data if we have particularly fast completions (ie we get the
841          * completion before we exit sock_xmit on the last bvec) or in the case
842          * that the server is misbehaving (or there was an error) before we're
843          * done sending everything over the wire.
844          */
845         init_completion(&cmd->send_complete);
846
847         /* We can be called directly from the user space process, which means
848          * signals may be pending and our sendmsg can fail.  In that case we
849          * need to return that we are busy; otherwise error out as
850          * appropriate.
851          */
852         ret = nbd_handle_cmd(cmd, hctx->queue_num);
853         if (ret < 0)
854                 ret = BLK_STS_IOERR;
855         else if (!ret)
856                 ret = BLK_STS_OK;
857         complete(&cmd->send_complete);
858
859         return ret;
860 }
861
862 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
863                           bool netlink)
864 {
865         struct nbd_config *config = nbd->config;
866         struct socket *sock;
867         struct nbd_sock **socks;
868         struct nbd_sock *nsock;
869         int err;
870
871         sock = sockfd_lookup(arg, &err);
872         if (!sock)
873                 return err;
874
875         if (!netlink && !nbd->task_setup &&
876             !test_bit(NBD_BOUND, &config->runtime_flags))
877                 nbd->task_setup = current;
878
879         if (!netlink &&
880             (nbd->task_setup != current ||
881              test_bit(NBD_BOUND, &config->runtime_flags))) {
882                 dev_err(disk_to_dev(nbd->disk),
883                         "Device being setup by another task");
884                 sockfd_put(sock);
885                 return -EBUSY;
886         }
887
888         socks = krealloc(config->socks, (config->num_connections + 1) *
889                          sizeof(struct nbd_sock *), GFP_KERNEL);
890         if (!socks) {
891                 sockfd_put(sock);
892                 return -ENOMEM;
893         }
894         nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
895         if (!nsock) {
896                 sockfd_put(sock);
897                 return -ENOMEM;
898         }
899
900         config->socks = socks;
901
902         nsock->fallback_index = -1;
903         nsock->dead = false;
904         mutex_init(&nsock->tx_lock);
905         nsock->sock = sock;
906         nsock->pending = NULL;
907         nsock->sent = 0;
908         nsock->cookie = 0;
909         socks[config->num_connections++] = nsock;
910         atomic_inc(&config->live_connections);
911
912         return 0;
913 }
914
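/*
 * Replace a dead connection with a freshly connected socket (passed in as a
 * file descriptor) and restart its receive worker.
 */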
915 static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
916 {
917         struct nbd_config *config = nbd->config;
918         struct socket *sock, *old;
919         struct recv_thread_args *args;
920         int i;
921         int err;
922
923         sock = sockfd_lookup(arg, &err);
924         if (!sock)
925                 return err;
926
927         args = kzalloc(sizeof(*args), GFP_KERNEL);
928         if (!args) {
929                 sockfd_put(sock);
930                 return -ENOMEM;
931         }
932
933         for (i = 0; i < config->num_connections; i++) {
934                 struct nbd_sock *nsock = config->socks[i];
935
936                 if (!nsock->dead)
937                         continue;
938
939                 mutex_lock(&nsock->tx_lock);
940                 if (!nsock->dead) {
941                         mutex_unlock(&nsock->tx_lock);
942                         continue;
943                 }
944                 sk_set_memalloc(sock->sk);
945                 if (nbd->tag_set.timeout)
946                         sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
947                 atomic_inc(&config->recv_threads);
948                 refcount_inc(&nbd->config_refs);
949                 old = nsock->sock;
950                 nsock->fallback_index = -1;
951                 nsock->sock = sock;
952                 nsock->dead = false;
953                 INIT_WORK(&args->work, recv_work);
954                 args->index = i;
955                 args->nbd = nbd;
956                 nsock->cookie++;
957                 mutex_unlock(&nsock->tx_lock);
958                 sockfd_put(old);
959
960                 clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
961
962                 /* We take the tx_lock mutex in an error path in recv_work, so we
963                  * need to queue_work outside of it.
964                  */
965                 queue_work(recv_workqueue, &args->work);
966
967                 atomic_inc(&config->live_connections);
968                 wake_up(&config->conn_wait);
969                 return 0;
970         }
971         sockfd_put(sock);
972         kfree(args);
973         return -ENOSPC;
974 }
975
976 static void nbd_bdev_reset(struct block_device *bdev)
977 {
978         if (bdev->bd_openers > 1)
979                 return;
980         bd_set_size(bdev, 0);
981 }
982
983 static void nbd_parse_flags(struct nbd_device *nbd)
984 {
985         struct nbd_config *config = nbd->config;
986         if (config->flags & NBD_FLAG_READ_ONLY)
987                 set_disk_ro(nbd->disk, true);
988         else
989                 set_disk_ro(nbd->disk, false);
990         if (config->flags & NBD_FLAG_SEND_TRIM)
991                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
992         if (config->flags & NBD_FLAG_SEND_FLUSH) {
993                 if (config->flags & NBD_FLAG_SEND_FUA)
994                         blk_queue_write_cache(nbd->disk->queue, true, true);
995                 else
996                         blk_queue_write_cache(nbd->disk->queue, true, false);
997         }
998         else
999                 blk_queue_write_cache(nbd->disk->queue, false, false);
1000 }
1001
1002 static void send_disconnects(struct nbd_device *nbd)
1003 {
1004         struct nbd_config *config = nbd->config;
1005         struct nbd_request request = {
1006                 .magic = htonl(NBD_REQUEST_MAGIC),
1007                 .type = htonl(NBD_CMD_DISC),
1008         };
1009         struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1010         struct iov_iter from;
1011         int i, ret;
1012
1013         for (i = 0; i < config->num_connections; i++) {
1014                 struct nbd_sock *nsock = config->socks[i];
1015
1016                 iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
1017                 mutex_lock(&nsock->tx_lock);
1018                 ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1019                 if (ret <= 0)
1020                         dev_err(disk_to_dev(nbd->disk),
1021                                 "Send disconnect failed %d\n", ret);
1022                 mutex_unlock(&nsock->tx_lock);
1023         }
1024 }
1025
1026 static int nbd_disconnect(struct nbd_device *nbd)
1027 {
1028         struct nbd_config *config = nbd->config;
1029
1030         dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1031         set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
1032         send_disconnects(nbd);
1033         return 0;
1034 }
1035
1036 static void nbd_clear_sock(struct nbd_device *nbd)
1037 {
1038         sock_shutdown(nbd);
1039         nbd_clear_que(nbd);
1040         nbd->task_setup = NULL;
1041 }
1042
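/*
 * Drop a reference on the current configuration.  The final put tears down
 * the sockets, the sysfs/debugfs entries and the config itself.
 */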
1043 static void nbd_config_put(struct nbd_device *nbd)
1044 {
1045         if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1046                                         &nbd->config_lock)) {
1047                 struct nbd_config *config = nbd->config;
1048                 nbd_dev_dbg_close(nbd);
1049                 nbd_size_clear(nbd);
1050                 if (test_and_clear_bit(NBD_HAS_PID_FILE,
1051                                        &config->runtime_flags))
1052                         device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1053                 nbd->task_recv = NULL;
1054                 nbd_clear_sock(nbd);
1055                 if (config->num_connections) {
1056                         int i;
1057                         for (i = 0; i < config->num_connections; i++) {
1058                                 sockfd_put(config->socks[i]->sock);
1059                                 kfree(config->socks[i]);
1060                         }
1061                         kfree(config->socks);
1062                 }
1063                 kfree(nbd->config);
1064                 nbd->config = NULL;
1065
1066                 nbd->tag_set.timeout = 0;
1067                 nbd->disk->queue->limits.discard_granularity = 0;
1068                 nbd->disk->queue->limits.discard_alignment = 0;
1069                 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1070                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1071
1072                 mutex_unlock(&nbd->config_lock);
1073                 nbd_put(nbd);
1074                 module_put(THIS_MODULE);
1075         }
1076 }
1077
1078 static int nbd_start_device(struct nbd_device *nbd)
1079 {
1080         struct nbd_config *config = nbd->config;
1081         int num_connections = config->num_connections;
1082         int error = 0, i;
1083
1084         if (nbd->task_recv)
1085                 return -EBUSY;
1086         if (!config->socks)
1087                 return -EINVAL;
1088         if (num_connections > 1 &&
1089             !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1090                 dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1091                 return -EINVAL;
1092         }
1093
1094         blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1095         nbd->task_recv = current;
1096
1097         nbd_parse_flags(nbd);
1098
1099         error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1100         if (error) {
1101                 dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1102                 return error;
1103         }
1104         set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1105
1106         nbd_dev_dbg_init(nbd);
1107         for (i = 0; i < num_connections; i++) {
1108                 struct recv_thread_args *args;
1109
1110                 args = kzalloc(sizeof(*args), GFP_KERNEL);
1111                 if (!args) {
1112                         sock_shutdown(nbd);
1113                         return -ENOMEM;
1114                 }
1115                 sk_set_memalloc(config->socks[i]->sock->sk);
1116                 if (nbd->tag_set.timeout)
1117                         config->socks[i]->sock->sk->sk_sndtimeo =
1118                                 nbd->tag_set.timeout;
1119                 atomic_inc(&config->recv_threads);
1120                 refcount_inc(&nbd->config_refs);
1121                 INIT_WORK(&args->work, recv_work);
1122                 args->nbd = nbd;
1123                 args->index = i;
1124                 queue_work(recv_workqueue, &args->work);
1125         }
1126         nbd_size_update(nbd);
1127         return error;
1128 }
1129
1130 static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1131 {
1132         struct nbd_config *config = nbd->config;
1133         int ret;
1134
1135         ret = nbd_start_device(nbd);
1136         if (ret)
1137                 return ret;
1138
1139         if (max_part)
1140                 bdev->bd_invalidated = 1;
1141         mutex_unlock(&nbd->config_lock);
1142         ret = wait_event_interruptible(config->recv_wq,
1143                                          atomic_read(&config->recv_threads) == 0);
1144         if (ret)
1145                 sock_shutdown(nbd);
1146         mutex_lock(&nbd->config_lock);
1147         nbd_bdev_reset(bdev);
1148         /* user requested, ignore socket errors */
1149         if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
1150                 ret = 0;
1151         if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
1152                 ret = -ETIMEDOUT;
1153         return ret;
1154 }
1155
1156 static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1157                                  struct block_device *bdev)
1158 {
1159         sock_shutdown(nbd);
1160         kill_bdev(bdev);
1161         nbd_bdev_reset(bdev);
1162         if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1163                                &nbd->config->runtime_flags))
1164                 nbd_config_put(nbd);
1165 }
1166
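/*
 * Legacy ioctl configuration, roughly as a userspace client would drive it.
 * A minimal sketch only (error handling omitted; sock_fd is assumed to be a
 * socket already connected to an NBD server after the protocol handshake):
 *
 *	int nbd = open("/dev/nbd0", O_RDWR);
 *	ioctl(nbd, NBD_SET_SOCK, sock_fd);
 *	ioctl(nbd, NBD_SET_BLKSIZE, 4096UL);
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd, NBD_DO_IT);
 *
 * NBD_DO_IT blocks until the device is disconnected or the sockets fail.
 */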
1167 /* Must be called with config_lock held */
1168 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1169                        unsigned int cmd, unsigned long arg)
1170 {
1171         struct nbd_config *config = nbd->config;
1172
1173         switch (cmd) {
1174         case NBD_DISCONNECT:
1175                 return nbd_disconnect(nbd);
1176         case NBD_CLEAR_SOCK:
1177                 nbd_clear_sock_ioctl(nbd, bdev);
1178                 return 0;
1179         case NBD_SET_SOCK:
1180                 return nbd_add_socket(nbd, arg, false);
1181         case NBD_SET_BLKSIZE:
1182                 nbd_size_set(nbd, arg,
1183                              div_s64(config->bytesize, arg));
1184                 return 0;
1185         case NBD_SET_SIZE:
1186                 nbd_size_set(nbd, config->blksize,
1187                              div_s64(arg, config->blksize));
1188                 return 0;
1189         case NBD_SET_SIZE_BLOCKS:
1190                 nbd_size_set(nbd, config->blksize, arg);
1191                 return 0;
1192         case NBD_SET_TIMEOUT:
1193                 if (arg) {
1194                         nbd->tag_set.timeout = arg * HZ;
1195                         blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
1196                 }
1197                 return 0;
1198
1199         case NBD_SET_FLAGS:
1200                 config->flags = arg;
1201                 return 0;
1202         case NBD_DO_IT:
1203                 return nbd_start_device_ioctl(nbd, bdev);
1204         case NBD_CLEAR_QUE:
1205                 /*
1206                  * This is for compatibility only.  The queue is always cleared
1207                  * by NBD_DO_IT or NBD_CLEAR_SOCK.
1208                  */
1209                 return 0;
1210         case NBD_PRINT_DEBUG:
1211                 /*
1212                  * For compatibility only, we no longer keep a list of
1213                  * outstanding requests.
1214                  */
1215                 return 0;
1216         }
1217         return -ENOTTY;
1218 }
1219
1220 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1221                      unsigned int cmd, unsigned long arg)
1222 {
1223         struct nbd_device *nbd = bdev->bd_disk->private_data;
1224         struct nbd_config *config = nbd->config;
1225         int error = -EINVAL;
1226
1227         if (!capable(CAP_SYS_ADMIN))
1228                 return -EPERM;
1229
1230         /* The block layer will pass back some non-nbd ioctls in case we have
1231          * special handling for them, but we don't, so just return an error.
1232          */
1233         if (_IOC_TYPE(cmd) != 0xab)
1234                 return -EINVAL;
1235
1236         mutex_lock(&nbd->config_lock);
1237
1238         /* Don't allow ioctl operations on an nbd device that was created with
1239          * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1240          */
1241         if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1242             (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1243                 error = __nbd_ioctl(bdev, nbd, cmd, arg);
1244         else
1245                 dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1246         mutex_unlock(&nbd->config_lock);
1247         return error;
1248 }
1249
1250 static struct nbd_config *nbd_alloc_config(void)
1251 {
1252         struct nbd_config *config;
1253
1254         config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1255         if (!config)
1256                 return NULL;
1257         atomic_set(&config->recv_threads, 0);
1258         init_waitqueue_head(&config->recv_wq);
1259         init_waitqueue_head(&config->conn_wait);
1260         config->blksize = 1024;
1261         atomic_set(&config->live_connections, 0);
1262         try_module_get(THIS_MODULE);
1263         return config;
1264 }
1265
1266 static int nbd_open(struct block_device *bdev, fmode_t mode)
1267 {
1268         struct nbd_device *nbd;
1269         int ret = 0;
1270
1271         mutex_lock(&nbd_index_mutex);
1272         nbd = bdev->bd_disk->private_data;
1273         if (!nbd) {
1274                 ret = -ENXIO;
1275                 goto out;
1276         }
1277         if (!refcount_inc_not_zero(&nbd->refs)) {
1278                 ret = -ENXIO;
1279                 goto out;
1280         }
1281         if (!refcount_inc_not_zero(&nbd->config_refs)) {
1282                 struct nbd_config *config;
1283
1284                 mutex_lock(&nbd->config_lock);
1285                 if (refcount_inc_not_zero(&nbd->config_refs)) {
1286                         mutex_unlock(&nbd->config_lock);
1287                         goto out;
1288                 }
1289                 config = nbd->config = nbd_alloc_config();
1290                 if (!config) {
1291                         ret = -ENOMEM;
1292                         mutex_unlock(&nbd->config_lock);
1293                         goto out;
1294                 }
1295                 refcount_set(&nbd->config_refs, 1);
1296                 refcount_inc(&nbd->refs);
1297                 mutex_unlock(&nbd->config_lock);
1298                 bdev->bd_invalidated = 1;
1299         } else if (nbd_disconnected(nbd->config)) {
1300                 bdev->bd_invalidated = 1;
1301         }
1302 out:
1303         mutex_unlock(&nbd_index_mutex);
1304         return ret;
1305 }
1306
1307 static void nbd_release(struct gendisk *disk, fmode_t mode)
1308 {
1309         struct nbd_device *nbd = disk->private_data;
1310         struct block_device *bdev = bdget_disk(disk, 0);
1311
1312         if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1313                         bdev->bd_openers == 0)
1314                 nbd_disconnect_and_put(nbd);
1315
1316         nbd_config_put(nbd);
1317         nbd_put(nbd);
1318 }
1319
1320 static const struct block_device_operations nbd_fops =
1321 {
1322         .owner =        THIS_MODULE,
1323         .open =         nbd_open,
1324         .release =      nbd_release,
1325         .ioctl =        nbd_ioctl,
1326         .compat_ioctl = nbd_ioctl,
1327 };
1328
1329 #if IS_ENABLED(CONFIG_DEBUG_FS)
1330
1331 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1332 {
1333         struct nbd_device *nbd = s->private;
1334
1335         if (nbd->task_recv)
1336                 seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1337
1338         return 0;
1339 }
1340
1341 static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1342 {
1343         return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1344 }
1345
1346 static const struct file_operations nbd_dbg_tasks_ops = {
1347         .open = nbd_dbg_tasks_open,
1348         .read = seq_read,
1349         .llseek = seq_lseek,
1350         .release = single_release,
1351 };
1352
1353 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1354 {
1355         struct nbd_device *nbd = s->private;
1356         u32 flags = nbd->config->flags;
1357
1358         seq_printf(s, "Hex: 0x%08x\n\n", flags);
1359
1360         seq_puts(s, "Known flags:\n");
1361
1362         if (flags & NBD_FLAG_HAS_FLAGS)
1363                 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1364         if (flags & NBD_FLAG_READ_ONLY)
1365                 seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1366         if (flags & NBD_FLAG_SEND_FLUSH)
1367                 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1368         if (flags & NBD_FLAG_SEND_FUA)
1369                 seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1370         if (flags & NBD_FLAG_SEND_TRIM)
1371                 seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1372
1373         return 0;
1374 }
1375
1376 static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1377 {
1378         return single_open(file, nbd_dbg_flags_show, inode->i_private);
1379 }
1380
1381 static const struct file_operations nbd_dbg_flags_ops = {
1382         .open = nbd_dbg_flags_open,
1383         .read = seq_read,
1384         .llseek = seq_lseek,
1385         .release = single_release,
1386 };
1387
1388 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1389 {
1390         struct dentry *dir;
1391         struct nbd_config *config = nbd->config;
1392
1393         if (!nbd_dbg_dir)
1394                 return -EIO;
1395
1396         dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1397         if (!dir) {
1398                 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1399                         nbd_name(nbd));
1400                 return -EIO;
1401         }
1402         config->dbg_dir = dir;
1403
1404         debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1405         debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1406         debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1407         debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1408         debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1409
1410         return 0;
1411 }
1412
1413 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1414 {
1415         debugfs_remove_recursive(nbd->config->dbg_dir);
1416 }
1417
1418 static int nbd_dbg_init(void)
1419 {
1420         struct dentry *dbg_dir;
1421
1422         dbg_dir = debugfs_create_dir("nbd", NULL);
1423         if (!dbg_dir)
1424                 return -EIO;
1425
1426         nbd_dbg_dir = dbg_dir;
1427
1428         return 0;
1429 }
1430
1431 static void nbd_dbg_close(void)
1432 {
1433         debugfs_remove_recursive(nbd_dbg_dir);
1434 }
1435
1436 #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1437
1438 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1439 {
1440         return 0;
1441 }
1442
1443 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1444 {
1445 }
1446
1447 static int nbd_dbg_init(void)
1448 {
1449         return 0;
1450 }
1451
1452 static void nbd_dbg_close(void)
1453 {
1454 }
1455
1456 #endif
1457
1458 static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1459                             unsigned int hctx_idx, unsigned int numa_node)
1460 {
1461         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1462         cmd->nbd = set->driver_data;
1463         return 0;
1464 }
1465
1466 static const struct blk_mq_ops nbd_mq_ops = {
1467         .queue_rq       = nbd_queue_rq,
1468         .complete       = nbd_complete_rq,
1469         .init_request   = nbd_init_request,
1470         .timeout        = nbd_xmit_timeout,
1471 };
1472
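/*
 * Allocate a new nbd device together with its gendisk and blk-mq queue.
 * A negative @index means "pick the first free index".  Returns the index
 * used on success or a negative errno on failure.
 */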
1473 static int nbd_dev_add(int index)
1474 {
1475         struct nbd_device *nbd;
1476         struct gendisk *disk;
1477         struct request_queue *q;
1478         int err = -ENOMEM;
1479
1480         nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1481         if (!nbd)
1482                 goto out;
1483
1484         disk = alloc_disk(1 << part_shift);
1485         if (!disk)
1486                 goto out_free_nbd;
1487
1488         if (index >= 0) {
1489                 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1490                                 GFP_KERNEL);
1491                 if (err == -ENOSPC)
1492                         err = -EEXIST;
1493         } else {
1494                 err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1495                 if (err >= 0)
1496                         index = err;
1497         }
1498         if (err < 0)
1499                 goto out_free_disk;
1500
1501         nbd->index = index;
1502         nbd->disk = disk;
1503         nbd->tag_set.ops = &nbd_mq_ops;
1504         nbd->tag_set.nr_hw_queues = 1;
1505         nbd->tag_set.queue_depth = 128;
1506         nbd->tag_set.numa_node = NUMA_NO_NODE;
1507         nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1508         nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1509                 BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1510         nbd->tag_set.driver_data = nbd;
1511
1512         err = blk_mq_alloc_tag_set(&nbd->tag_set);
1513         if (err)
1514                 goto out_free_idr;
1515
1516         q = blk_mq_init_queue(&nbd->tag_set);
1517         if (IS_ERR(q)) {
1518                 err = PTR_ERR(q);
1519                 goto out_free_tags;
1520         }
1521         disk->queue = q;
1522
1523         /*
1524          * Tell the block layer that we are not a rotational device
1525          */
1526         blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1527         blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1528         disk->queue->limits.discard_granularity = 0;
1529         disk->queue->limits.discard_alignment = 0;
1530         blk_queue_max_discard_sectors(disk->queue, 0);
1531         blk_queue_max_segment_size(disk->queue, UINT_MAX);
1532         blk_queue_max_segments(disk->queue, USHRT_MAX);
1533         blk_queue_max_hw_sectors(disk->queue, 65536);
1534         disk->queue->limits.max_sectors = 256;
1535
1536         mutex_init(&nbd->config_lock);
1537         refcount_set(&nbd->config_refs, 0);
1538         refcount_set(&nbd->refs, 1);
1539         INIT_LIST_HEAD(&nbd->list);
1540         disk->major = NBD_MAJOR;
1541         disk->first_minor = index << part_shift;
1542         disk->fops = &nbd_fops;
1543         disk->private_data = nbd;
1544         sprintf(disk->disk_name, "nbd%d", index);
1545         add_disk(disk);
1546         nbd_total_devices++;
1547         return index;
1548
1549 out_free_tags:
1550         blk_mq_free_tag_set(&nbd->tag_set);
1551 out_free_idr:
1552         idr_remove(&nbd_index_idr, index);
1553 out_free_disk:
1554         put_disk(disk);
1555 out_free_nbd:
1556         kfree(nbd);
1557 out:
1558         return err;
1559 }
1560
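/*
 * idr_for_each() callback: stop at the first device that currently has
 * no configuration (config_refs == 0) and hand it back via *data.
 */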
1561 static int find_free_cb(int id, void *ptr, void *data)
1562 {
1563         struct nbd_device *nbd = ptr;
1564         struct nbd_device **found = data;
1565
1566         if (!refcount_read(&nbd->config_refs)) {
1567                 *found = nbd;
1568                 return 1;
1569         }
1570         return 0;
1571 }
1572
1573 /* Netlink interface. */
1574 static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1575         [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
1576         [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
1577         [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
1578         [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
1579         [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
1580         [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
1581         [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
1582         [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
1583         [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
1584 };
1585
1586 static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1587         [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
1588 };
1589
1590 /* We don't use this right now since we don't parse the incoming list, but we
1591  * still want it here so userspace knows what to expect.
1592  */
1593 static struct nla_policy __attribute__((unused))
1594 nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1595         [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
1596         [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
1597 };
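/*
 * Illustrative sketch of the attribute nesting that nbd_genl_connect()
 * below expects (derived from the parsing code in this file, not a
 * formal spec):
 *
 *   NBD_ATTR_INDEX       (u32)    - optional, omit to pick any free device
 *   NBD_ATTR_SIZE_BYTES  (u64)    - required
 *   NBD_ATTR_SOCKETS     (nested) - required
 *     NBD_SOCK_ITEM      (nested) - one per connection
 *       NBD_SOCK_FD      (u32)    - an already-connected socket fd
 */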
1598
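/*
 * NBD_CMD_CONNECT handler: find or create the requested device, allocate
 * its config, apply the size/timeout/flag attributes, attach the supplied
 * sockets and start the device.
 */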
1599 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1600 {
1601         struct nbd_device *nbd = NULL;
1602         struct nbd_config *config;
1603         int index = -1;
1604         int ret;
1605         bool put_dev = false;
1606
1607         if (!netlink_capable(skb, CAP_SYS_ADMIN))
1608                 return -EPERM;
1609
1610         if (info->attrs[NBD_ATTR_INDEX])
1611                 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1612         if (!info->attrs[NBD_ATTR_SOCKETS]) {
1613                 printk(KERN_ERR "nbd: must specify at least one socket\n");
1614                 return -EINVAL;
1615         }
1616         if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1617                 printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1618                 return -EINVAL;
1619         }
1620 again:
1621         mutex_lock(&nbd_index_mutex);
1622         if (index == -1) {
1623                 ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1624                 if (ret == 0) {
1625                         int new_index;
1626                         new_index = nbd_dev_add(-1);
1627                         if (new_index < 0) {
1628                                 mutex_unlock(&nbd_index_mutex);
1629                                 printk(KERN_ERR "nbd: failed to add new device\n");
1630                                 return new_index;
1631                         }
1632                         nbd = idr_find(&nbd_index_idr, new_index);
1633                 }
1634         } else {
1635                 nbd = idr_find(&nbd_index_idr, index);
1636                 if (!nbd) {
1637                         ret = nbd_dev_add(index);
1638                         if (ret < 0) {
1639                                 mutex_unlock(&nbd_index_mutex);
1640                                 printk(KERN_ERR "nbd: failed to add new device\n");
1641                                 return ret;
1642                         }
1643                         nbd = idr_find(&nbd_index_idr, index);
1644                 }
1645         }
1646         if (!nbd) {
1647                 printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1648                        index);
1649                 mutex_unlock(&nbd_index_mutex);
1650                 return -EINVAL;
1651         }
1652         if (!refcount_inc_not_zero(&nbd->refs)) {
1653                 mutex_unlock(&nbd_index_mutex);
1654                 if (index == -1)
1655                         goto again;
1656                 printk(KERN_ERR "nbd: device at index %d is going down\n",
1657                        index);
1658                 return -EINVAL;
1659         }
1660         mutex_unlock(&nbd_index_mutex);
1661
1662         mutex_lock(&nbd->config_lock);
1663         if (refcount_read(&nbd->config_refs)) {
1664                 mutex_unlock(&nbd->config_lock);
1665                 nbd_put(nbd);
1666                 if (index == -1)
1667                         goto again;
1668                 printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1669                 return -EBUSY;
1670         }
1671         if (WARN_ON(nbd->config)) {
1672                 mutex_unlock(&nbd->config_lock);
1673                 nbd_put(nbd);
1674                 return -EINVAL;
1675         }
1676         config = nbd->config = nbd_alloc_config();
1677         if (!nbd->config) {
1678                 mutex_unlock(&nbd->config_lock);
1679                 nbd_put(nbd);
1680                 printk(KERN_ERR "nbd: couldn't allocate config\n");
1681                 return -ENOMEM;
1682         }
1683         refcount_set(&nbd->config_refs, 1);
1684         set_bit(NBD_BOUND, &config->runtime_flags);
1685
1686         if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
1687                 u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1688                 nbd_size_set(nbd, config->blksize,
1689                              div64_u64(bytes, config->blksize));
1690         }
1691         if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1692                 u64 bsize =
1693                         nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1694                 nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
1695         }
1696         if (info->attrs[NBD_ATTR_TIMEOUT]) {
1697                 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1698                 nbd->tag_set.timeout = timeout * HZ;
1699                 blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1700         }
1701         if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1702                 config->dead_conn_timeout =
1703                         nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1704                 config->dead_conn_timeout *= HZ;
1705         }
1706         if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1707                 config->flags =
1708                         nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1709         if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1710                 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1711                 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1712                         set_bit(NBD_DESTROY_ON_DISCONNECT,
1713                                 &config->runtime_flags);
1714                         put_dev = true;
1715                 }
1716                 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1717                         set_bit(NBD_DISCONNECT_ON_CLOSE,
1718                                 &config->runtime_flags);
1719                 }
1720         }
1721
1722         if (info->attrs[NBD_ATTR_SOCKETS]) {
1723                 struct nlattr *attr;
1724                 int rem, fd;
1725
1726                 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1727                                     rem) {
1728                         struct nlattr *socks[NBD_SOCK_MAX+1];
1729
1730                         if (nla_type(attr) != NBD_SOCK_ITEM) {
1731                                 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1732                                 ret = -EINVAL;
1733                                 goto out;
1734                         }
1735                         ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1736                                                nbd_sock_policy, info->extack);
1737                         if (ret != 0) {
1738                                 printk(KERN_ERR "nbd: error processing sock list\n");
1739                                 ret = -EINVAL;
1740                                 goto out;
1741                         }
1742                         if (!socks[NBD_SOCK_FD])
1743                                 continue;
1744                         fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1745                         ret = nbd_add_socket(nbd, fd, true);
1746                         if (ret)
1747                                 goto out;
1748                 }
1749         }
1750         ret = nbd_start_device(nbd);
1751 out:
1752         mutex_unlock(&nbd->config_lock);
1753         if (!ret) {
1754                 set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
1755                 refcount_inc(&nbd->config_refs);
1756                 nbd_connect_reply(info, nbd->index);
1757         }
1758         nbd_config_put(nbd);
1759         if (put_dev)
1760                 nbd_put(nbd);
1761         return ret;
1762 }
1763
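/*
 * Send a disconnect, tear down the sockets and, if the config still holds
 * the extra reference taken at connect time, drop it.
 */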
1764 static void nbd_disconnect_and_put(struct nbd_device *nbd)
1765 {
1766         mutex_lock(&nbd->config_lock);
1767         nbd_disconnect(nbd);
1768         nbd_clear_sock(nbd);
1769         mutex_unlock(&nbd->config_lock);
1770         if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1771                                &nbd->config->runtime_flags))
1772                 nbd_config_put(nbd);
1773 }
1774
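/*
 * NBD_CMD_DISCONNECT handler: look up the device by index and, if it is
 * still configured, disconnect it and drop the references we took.
 */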
1775 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1776 {
1777         struct nbd_device *nbd;
1778         int index;
1779
1780         if (!netlink_capable(skb, CAP_SYS_ADMIN))
1781                 return -EPERM;
1782
1783         if (!info->attrs[NBD_ATTR_INDEX]) {
1784                 printk(KERN_ERR "nbd: must specify an index to disconnect\n");
1785                 return -EINVAL;
1786         }
1787         index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1788         mutex_lock(&nbd_index_mutex);
1789         nbd = idr_find(&nbd_index_idr, index);
1790         if (!nbd) {
1791                 mutex_unlock(&nbd_index_mutex);
1792                 printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1793                        index);
1794                 return -EINVAL;
1795         }
1796         if (!refcount_inc_not_zero(&nbd->refs)) {
1797                 mutex_unlock(&nbd_index_mutex);
1798                 printk(KERN_ERR "nbd: device at index %d is going down\n",
1799                        index);
1800                 return -EINVAL;
1801         }
1802         mutex_unlock(&nbd_index_mutex);
1803         if (!refcount_inc_not_zero(&nbd->config_refs)) {
1804                 nbd_put(nbd);
1805                 return 0;
1806         }
1807         nbd_disconnect_and_put(nbd);
1808         nbd_config_put(nbd);
1809         nbd_put(nbd);
1810         return 0;
1811 }
1812
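/*
 * NBD_CMD_RECONFIGURE handler: update timeouts and client flags on an
 * already-running device and re-attach replacement sockets for dead links.
 */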
1813 static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1814 {
1815         struct nbd_device *nbd = NULL;
1816         struct nbd_config *config;
1817         int index;
1818         int ret = 0;
1819         bool put_dev = false;
1820
1821         if (!netlink_capable(skb, CAP_SYS_ADMIN))
1822                 return -EPERM;
1823
1824         if (!info->attrs[NBD_ATTR_INDEX]) {
1825                 printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
1826                 return -EINVAL;
1827         }
1828         index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1829         mutex_lock(&nbd_index_mutex);
1830         nbd = idr_find(&nbd_index_idr, index);
1831         if (!nbd) {
1832                 mutex_unlock(&nbd_index_mutex);
1833                 printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
1834                        index);
1835                 return -EINVAL;
1836         }
1837         if (!refcount_inc_not_zero(&nbd->refs)) {
1838                 mutex_unlock(&nbd_index_mutex);
1839                 printk(KERN_ERR "nbd: device at index %d is going down\n",
1840                        index);
1841                 return -EINVAL;
1842         }
1843         mutex_unlock(&nbd_index_mutex);
1844
1845         if (!refcount_inc_not_zero(&nbd->config_refs)) {
1846                 dev_err(nbd_to_dev(nbd),
1847                         "not configured, cannot reconfigure\n");
1848                 nbd_put(nbd);
1849                 return -EINVAL;
1850         }
1851
1852         mutex_lock(&nbd->config_lock);
1853         config = nbd->config;
1854         if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1855             !nbd->task_recv) {
1856                 dev_err(nbd_to_dev(nbd),
1857                         "not configured, cannot reconfigure\n");
1858                 ret = -EINVAL;
1859                 goto out;
1860         }
1861
1862         if (info->attrs[NBD_ATTR_TIMEOUT]) {
1863                 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1864                 nbd->tag_set.timeout = timeout * HZ;
1865                 blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1866         }
1867         if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1868                 config->dead_conn_timeout =
1869                         nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1870                 config->dead_conn_timeout *= HZ;
1871         }
1872         if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1873                 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1874                 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1875                         if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1876                                               &config->runtime_flags))
1877                                 put_dev = true;
1878                 } else {
1879                         if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1880                                                &config->runtime_flags))
1881                                 refcount_inc(&nbd->refs);
1882                 }
1883
1884                 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1885                         set_bit(NBD_DISCONNECT_ON_CLOSE,
1886                                         &config->runtime_flags);
1887                 } else {
1888                         clear_bit(NBD_DISCONNECT_ON_CLOSE,
1889                                         &config->runtime_flags);
1890                 }
1891         }
1892
1893         if (info->attrs[NBD_ATTR_SOCKETS]) {
1894                 struct nlattr *attr;
1895                 int rem, fd;
1896
1897                 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1898                                     rem) {
1899                         struct nlattr *socks[NBD_SOCK_MAX+1];
1900
1901                         if (nla_type(attr) != NBD_SOCK_ITEM) {
1902                                 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1903                                 ret = -EINVAL;
1904                                 goto out;
1905                         }
1906                         ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1907                                                nbd_sock_policy, info->extack);
1908                         if (ret != 0) {
1909                                 printk(KERN_ERR "nbd: error processing sock list\n");
1910                                 ret = -EINVAL;
1911                                 goto out;
1912                         }
1913                         if (!socks[NBD_SOCK_FD])
1914                                 continue;
1915                         fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1916                         ret = nbd_reconnect_socket(nbd, fd);
1917                         if (ret) {
1918                                 if (ret == -ENOSPC)
1919                                         ret = 0;
1920                                 goto out;
1921                         }
1922                         dev_info(nbd_to_dev(nbd), "reconnected socket\n");
1923                 }
1924         }
1925 out:
1926         mutex_unlock(&nbd->config_lock);
1927         nbd_config_put(nbd);
1928         nbd_put(nbd);
1929         if (put_dev)
1930                 nbd_put(nbd);
1931         return ret;
1932 }
1933
1934 static const struct genl_ops nbd_connect_genl_ops[] = {
1935         {
1936                 .cmd    = NBD_CMD_CONNECT,
1937                 .policy = nbd_attr_policy,
1938                 .doit   = nbd_genl_connect,
1939         },
1940         {
1941                 .cmd    = NBD_CMD_DISCONNECT,
1942                 .policy = nbd_attr_policy,
1943                 .doit   = nbd_genl_disconnect,
1944         },
1945         {
1946                 .cmd    = NBD_CMD_RECONFIGURE,
1947                 .policy = nbd_attr_policy,
1948                 .doit   = nbd_genl_reconfigure,
1949         },
1950         {
1951                 .cmd    = NBD_CMD_STATUS,
1952                 .policy = nbd_attr_policy,
1953                 .doit   = nbd_genl_status,
1954         },
1955 };
1956
1957 static const struct genl_multicast_group nbd_mcast_grps[] = {
1958         { .name = NBD_GENL_MCAST_GROUP_NAME, },
1959 };
1960
1961 static struct genl_family nbd_genl_family __ro_after_init = {
1962         .hdrsize        = 0,
1963         .name           = NBD_GENL_FAMILY_NAME,
1964         .version        = NBD_GENL_VERSION,
1965         .module         = THIS_MODULE,
1966         .ops            = nbd_connect_genl_ops,
1967         .n_ops          = ARRAY_SIZE(nbd_connect_genl_ops),
1968         .maxattr        = NBD_ATTR_MAX,
1969         .mcgrps         = nbd_mcast_grps,
1970         .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
1971 };
1972
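/*
 * Emit one NBD_DEVICE_ITEM nest (device index plus a connected flag) into
 * a status reply.
 */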
1973 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
1974 {
1975         struct nlattr *dev_opt;
1976         u8 connected = 0;
1977         int ret;
1978
1979         /* This is a little racy, but for status it's ok.  The
1980          * reason we don't take a ref here is that we can't
1981          * take a ref in the index == -1 case, as we would need
1982          * to put it under the nbd_index_mutex, which could
1983          * deadlock if we are configured to remove ourselves
1984          * once we're disconnected.
1985          */
1986         if (refcount_read(&nbd->config_refs))
1987                 connected = 1;
1988         dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
1989         if (!dev_opt)
1990                 return -EMSGSIZE;
1991         ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
1992         if (ret)
1993                 return -EMSGSIZE;
1994         ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
1995                          connected);
1996         if (ret)
1997                 return -EMSGSIZE;
1998         nla_nest_end(reply, dev_opt);
1999         return 0;
2000 }
2001
2002 static int status_cb(int id, void *ptr, void *data)
2003 {
2004         struct nbd_device *nbd = ptr;
2005         return populate_nbd_status(nbd, (struct sk_buff *)data);
2006 }
2007
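/*
 * NBD_CMD_STATUS handler: reply with an NBD_ATTR_DEVICE_LIST nest covering
 * either the requested index or every registered device.
 */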
2008 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2009 {
2010         struct nlattr *dev_list;
2011         struct sk_buff *reply;
2012         void *reply_head;
2013         size_t msg_size;
2014         int index = -1;
2015         int ret = -ENOMEM;
2016
2017         if (info->attrs[NBD_ATTR_INDEX])
2018                 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2019
2020         mutex_lock(&nbd_index_mutex);
2021
2022         msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2023                                   nla_attr_size(sizeof(u8)));
2024         msg_size *= (index == -1) ? nbd_total_devices : 1;
2025
2026         reply = genlmsg_new(msg_size, GFP_KERNEL);
2027         if (!reply)
2028                 goto out;
2029         reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2030                                        NBD_CMD_STATUS);
2031         if (!reply_head) {
2032                 nlmsg_free(reply);
2033                 goto out;
2034         }
2035
2036         dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
2037         if (index == -1) {
2038                 ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2039                 if (ret) {
2040                         nlmsg_free(reply);
2041                         goto out;
2042                 }
2043         } else {
2044                 struct nbd_device *nbd;
2045                 nbd = idr_find(&nbd_index_idr, index);
2046                 if (nbd) {
2047                         ret = populate_nbd_status(nbd, reply);
2048                         if (ret) {
2049                                 nlmsg_free(reply);
2050                                 goto out;
2051                         }
2052                 }
2053         }
2054         nla_nest_end(reply, dev_list);
2055         genlmsg_end(reply, reply_head);
2056         genlmsg_reply(reply, info);
2057         ret = 0;
2058 out:
2059         mutex_unlock(&nbd_index_mutex);
2060         return ret;
2061 }
2062
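/*
 * Unicast the index of the newly configured device back to the sender of
 * the NBD_CMD_CONNECT request.
 */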
2063 static void nbd_connect_reply(struct genl_info *info, int index)
2064 {
2065         struct sk_buff *skb;
2066         void *msg_head;
2067         int ret;
2068
2069         skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2070         if (!skb)
2071                 return;
2072         msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2073                                      NBD_CMD_CONNECT);
2074         if (!msg_head) {
2075                 nlmsg_free(skb);
2076                 return;
2077         }
2078         ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2079         if (ret) {
2080                 nlmsg_free(skb);
2081                 return;
2082         }
2083         genlmsg_end(skb, msg_head);
2084         genlmsg_reply(skb, info);
2085 }
2086
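/*
 * Broadcast an NBD_CMD_LINK_DEAD notification carrying the device index to
 * the nbd netlink multicast group.
 */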
2087 static void nbd_mcast_index(int index)
2088 {
2089         struct sk_buff *skb;
2090         void *msg_head;
2091         int ret;
2092
2093         skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2094         if (!skb)
2095                 return;
2096         msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2097                                      NBD_CMD_LINK_DEAD);
2098         if (!msg_head) {
2099                 nlmsg_free(skb);
2100                 return;
2101         }
2102         ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2103         if (ret) {
2104                 nlmsg_free(skb);
2105                 return;
2106         }
2107         genlmsg_end(skb, msg_head);
2108         genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2109 }
2110
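/* Workqueue callback: deliver the dead-link multicast from process context. */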
2111 static void nbd_dead_link_work(struct work_struct *work)
2112 {
2113         struct link_dead_args *args = container_of(work, struct link_dead_args,
2114                                                    work);
2115         nbd_mcast_index(args->index);
2116         kfree(args);
2117 }
2118
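/*
 * Module init: validate max_part/nbds_max, create the receive workqueue,
 * register the block major and the generic netlink family, then pre-create
 * nbds_max devices.
 */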
2119 static int __init nbd_init(void)
2120 {
2121         int i;
2122
2123         BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2124
2125         if (max_part < 0) {
2126                 printk(KERN_ERR "nbd: max_part must be >= 0\n");
2127                 return -EINVAL;
2128         }
2129
2130         part_shift = 0;
2131         if (max_part > 0) {
2132                 part_shift = fls(max_part);
2133
2134                 /*
2135                  * Adjust max_part according to part_shift as it is exported
2136                  * to user space so that users can know the maximum number of
2137                  * partitions the kernel should be able to manage.
2138                  *
2139                  * Note that -1 is required because partition 0 is reserved
2140                  * for the whole disk.
2141                  */
2142                 max_part = (1UL << part_shift) - 1;
2143         }
2144
2145         if ((1UL << part_shift) > DISK_MAX_PARTS)
2146                 return -EINVAL;
2147
2148         if (nbds_max > 1UL << (MINORBITS - part_shift))
2149                 return -EINVAL;
2150         recv_workqueue = alloc_workqueue("knbd-recv",
2151                                          WQ_MEM_RECLAIM | WQ_HIGHPRI |
2152                                          WQ_UNBOUND, 0);
2153         if (!recv_workqueue)
2154                 return -ENOMEM;
2155
2156         if (register_blkdev(NBD_MAJOR, "nbd")) {
2157                 destroy_workqueue(recv_workqueue);
2158                 return -EIO;
2159         }
2160
2161         if (genl_register_family(&nbd_genl_family)) {
2162                 unregister_blkdev(NBD_MAJOR, "nbd");
2163                 destroy_workqueue(recv_workqueue);
2164                 return -EINVAL;
2165         }
2166         nbd_dbg_init();
2167
2168         mutex_lock(&nbd_index_mutex);
2169         for (i = 0; i < nbds_max; i++)
2170                 nbd_dev_add(i);
2171         mutex_unlock(&nbd_index_mutex);
2172         return 0;
2173 }
2174
2175 static int nbd_exit_cb(int id, void *ptr, void *data)
2176 {
2177         struct list_head *list = (struct list_head *)data;
2178         struct nbd_device *nbd = ptr;
2179
2180         list_add_tail(&nbd->list, list);
2181         return 0;
2182 }
2183
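/*
 * Module exit: tear down debugfs, drop the final reference on every device
 * still in the IDR, then unregister the netlink family, the receive
 * workqueue and the block major.
 */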
2184 static void __exit nbd_cleanup(void)
2185 {
2186         struct nbd_device *nbd;
2187         LIST_HEAD(del_list);
2188
2189         nbd_dbg_close();
2190
2191         mutex_lock(&nbd_index_mutex);
2192         idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2193         mutex_unlock(&nbd_index_mutex);
2194
2195         while (!list_empty(&del_list)) {
2196                 nbd = list_first_entry(&del_list, struct nbd_device, list);
2197                 list_del_init(&nbd->list);
2198                 if (refcount_read(&nbd->refs) != 1)
2199                         printk(KERN_ERR "nbd: possibly leaking a device\n");
2200                 nbd_put(nbd);
2201         }
2202
2203         idr_destroy(&nbd_index_idr);
2204         genl_unregister_family(&nbd_genl_family);
2205         destroy_workqueue(recv_workqueue);
2206         unregister_blkdev(NBD_MAJOR, "nbd");
2207 }
2208
2209 module_init(nbd_init);
2210 module_exit(nbd_cleanup);
2211
2212 MODULE_DESCRIPTION("Network Block Device");
2213 MODULE_LICENSE("GPL");
2214
2215 module_param(nbds_max, int, 0444);
2216 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2217 module_param(max_part, int, 0444);
2218 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
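/*
 * Example (hypothetical values): "modprobe nbd nbds_max=4 max_part=15"
 * creates /dev/nbd0../dev/nbd3, each with room for up to 15 partitions.
 * Both parameters are read-only once the module is loaded (mode 0444).
 */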