drivers/block/rnbd/rnbd-clt.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * RDMA Network Block Driver
4  *
5  * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
6  * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
7  * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
8  */
9
10 #undef pr_fmt
11 #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
12
13 #include <linux/module.h>
14 #include <linux/blkdev.h>
15 #include <linux/hdreg.h>
16 #include <linux/scatterlist.h>
17 #include <linux/idr.h>
18
19 #include "rnbd-clt.h"
20
21 MODULE_DESCRIPTION("RDMA Network Block Device Client");
22 MODULE_LICENSE("GPL");
23
24 static int rnbd_client_major;
25 static DEFINE_IDA(index_ida);
26 static DEFINE_MUTEX(ida_lock);
27 static DEFINE_MUTEX(sess_lock);
28 static LIST_HEAD(sess_list);
29
30 /*
31  * Maximum number of partitions an instance can have.
32  * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
33  */
34 #define RNBD_PART_BITS          6
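/*
 * Worked example of the minor-number layout (illustration only, not part
 * of the original source): the index allocated from index_ida below is
 * turned into a minor range in rnbd_clt_setup_gen_disk().  With
 * RNBD_PART_BITS == 6, device index 2 gets first_minor = 2 << 6 = 128
 * and owns minors 128..191: one for the whole disk plus up to 63
 * partitions.  Since MINORBITS is 20, ida_simple_get() is bounded by
 * 1 << (20 - 6) = 16384 device indices per major.
 */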
35
36 static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess)
37 {
38         return refcount_inc_not_zero(&sess->refcount);
39 }
40
41 static void free_sess(struct rnbd_clt_session *sess);
42
43 static void rnbd_clt_put_sess(struct rnbd_clt_session *sess)
44 {
45         might_sleep();
46
47         if (refcount_dec_and_test(&sess->refcount))
48                 free_sess(sess);
49 }
50
51 static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
52 {
53         might_sleep();
54
55         if (!refcount_dec_and_test(&dev->refcount))
56                 return;
57
58         mutex_lock(&ida_lock);
59         ida_simple_remove(&index_ida, dev->clt_device_id);
60         mutex_unlock(&ida_lock);
61         kfree(dev->hw_queues);
62         rnbd_clt_put_sess(dev->sess);
63         mutex_destroy(&dev->lock);
64         kfree(dev);
65 }
66
67 static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
68 {
69         return refcount_inc_not_zero(&dev->refcount);
70 }
71
72 static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
73                                  const struct rnbd_msg_open_rsp *rsp)
74 {
75         struct rnbd_clt_session *sess = dev->sess;
76
77         if (!rsp->logical_block_size)
78                 return -EINVAL;
79
80         dev->device_id              = le32_to_cpu(rsp->device_id);
81         dev->nsectors               = le64_to_cpu(rsp->nsectors);
82         dev->logical_block_size     = le16_to_cpu(rsp->logical_block_size);
83         dev->physical_block_size    = le16_to_cpu(rsp->physical_block_size);
84         dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors);
85         dev->max_discard_sectors    = le32_to_cpu(rsp->max_discard_sectors);
86         dev->discard_granularity    = le32_to_cpu(rsp->discard_granularity);
87         dev->discard_alignment      = le32_to_cpu(rsp->discard_alignment);
88         dev->secure_discard         = le16_to_cpu(rsp->secure_discard);
89         dev->rotational             = rsp->rotational;
90
91         dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
92         dev->max_segments = BMAX_SEGMENTS;
93
94         dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors,
95                                     le32_to_cpu(rsp->max_hw_sectors));
96         dev->max_segments = min_t(u16, dev->max_segments,
97                                   le16_to_cpu(rsp->max_segments));
98
99         return 0;
100 }
101
102 static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
103                                     size_t new_nsectors)
104 {
105         int err = 0;
106
107         rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
108                        dev->nsectors, new_nsectors);
109         dev->nsectors = new_nsectors;
110         set_capacity(dev->gd, dev->nsectors);
111         err = revalidate_disk(dev->gd);
112         if (err)
113                 rnbd_clt_err(dev,
114                               "Failed to change device size from %zu to %zu, err: %d\n",
115                               dev->nsectors, new_nsectors, err);
116         return err;
117 }
118
119 static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
120                                 struct rnbd_msg_open_rsp *rsp)
121 {
122         int err = 0;
123
124         mutex_lock(&dev->lock);
125         if (dev->dev_state == DEV_STATE_UNMAPPED) {
126                 rnbd_clt_info(dev,
127                                "Ignoring Open-Response message from server for unmapped device\n");
128                 err = -ENOENT;
129                 goto out;
130         }
131         if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
132                 u64 nsectors = le64_to_cpu(rsp->nsectors);
133
134                 /*
135                  * If the device was remapped and the size changed in the
136                  * meantime we need to revalidate it
137                  */
138                 if (dev->nsectors != nsectors)
139                         rnbd_clt_change_capacity(dev, nsectors);
140                 rnbd_clt_info(dev, "Device online, device remapped successfully\n");
141         }
142         err = rnbd_clt_set_dev_attr(dev, rsp);
143         if (err)
144                 goto out;
145         dev->dev_state = DEV_STATE_MAPPED;
146
147 out:
148         mutex_unlock(&dev->lock);
149
150         return err;
151 }
152
153 int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
154 {
155         int ret = 0;
156
157         mutex_lock(&dev->lock);
158         if (dev->dev_state != DEV_STATE_MAPPED) {
159                 pr_err("Failed to set new size of the device, device is not opened\n");
160                 ret = -ENOENT;
161                 goto out;
162         }
163         ret = rnbd_clt_change_capacity(dev, newsize);
164
165 out:
166         mutex_unlock(&dev->lock);
167
168         return ret;
169 }
170
171 static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q)
172 {
173         if (WARN_ON(!q->hctx))
174                 return;
175
176         /* We can come here from interrupt, thus async=true */
177         blk_mq_run_hw_queue(q->hctx, true);
178 }
179
180 enum {
181         RNBD_DELAY_IFBUSY = -1,
182 };
183
184 /**
185  * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun
186  * @sess:       Session to find a queue for
187  * @cpu:        Cpu to start the search from
188  *
189  * Description:
190  *     Each CPU has a list of HW queues which need to be rerun.  If a list
191  *     is not empty, it is marked with a bit.  This function finds the first
192  *     set bit in the bitmap and returns the corresponding CPU list.
193  */
194 static struct rnbd_cpu_qlist *
195 rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu)
196 {
197         int bit;
198
199         /* Search from cpu to nr_cpu_ids */
200         bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);
201         if (bit < nr_cpu_ids) {
202                 return per_cpu_ptr(sess->cpu_queues, bit);
203         } else if (cpu != 0) {
204                 /* Search from 0 to cpu */
205                 bit = find_next_bit(sess->cpu_queues_bm, cpu, 0);
206                 if (bit < cpu)
207                         return per_cpu_ptr(sess->cpu_queues, bit);
208         }
209
210         return NULL;
211 }
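/*
 * Worked example of the wrap-around search above (CPU count made up for
 * illustration): with nr_cpu_ids == 4 and only bit 1 set in
 * cpu_queues_bm, a call with cpu == 3 first scans bits [3, 4) and finds
 * nothing, then scans bits [0, 3), finds bit 1 and returns
 * per_cpu_ptr(cpu_queues, 1).  If no bit is set at all, NULL is
 * returned and rnbd_rerun_if_needed() stops iterating.
 */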
212
213 static inline int nxt_cpu(int cpu)
214 {
215         return (cpu + 1) % nr_cpu_ids;
216 }
217
218 /**
219  * rnbd_rerun_if_needed() - rerun next queue marked as stopped
220  * @sess:       Session to rerun a queue on
221  *
222  * Description:
223  *     Each CPU has its own list of HW queues which should be rerun.  The
224  *     function finds such a list, takes the list lock, picks the first HW
225  *     queue off the list and requeues it.
226  *
227  * Return:
228  *     True if the queue was requeued, false otherwise.
229  *
230  * Context:
231  *     Does not matter.
232  */
233 static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)
234 {
235         struct rnbd_queue *q = NULL;
236         struct rnbd_cpu_qlist *cpu_q;
237         unsigned long flags;
238         int *cpup;
239
240         /*
241          * To keep fairness and not to let other queues starve we always
242          * try to wake up someone else in a round-robin manner.  That of course
243          * increases latency but queues always have a chance to be executed.
244          */
245         cpup = get_cpu_ptr(sess->cpu_rr);
246         for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q;
247              cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
248                 if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
249                         continue;
250                 if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm)))
251                         goto unlock;
252                 q = list_first_entry_or_null(&cpu_q->requeue_list,
253                                              typeof(*q), requeue_list);
254                 if (WARN_ON(!q))
255                         goto clear_bit;
256                 list_del_init(&q->requeue_list);
257                 clear_bit_unlock(0, &q->in_list);
258
259                 if (list_empty(&cpu_q->requeue_list)) {
260                         /* Clear bit if nothing is left */
261 clear_bit:
262                         clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
263                 }
264 unlock:
265                 spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
266
267                 if (q)
268                         break;
269         }
270
271         /*
272          * Save the CPU whose queue is going to be requeued in the per-cpu var. Just
273          * incrementing it doesn't work because rnbd_get_cpu_qlist() will
274          * always return the first CPU with something on the queue list when the
275          * value stored on the var is greater than the last CPU with something
276          * on the list.
277          */
278         if (cpu_q)
279                 *cpup = cpu_q->cpu;
280         put_cpu_var(sess->cpu_rr);
281
282         if (q)
283                 rnbd_clt_dev_requeue(q);
284
285         return q;
286 }
287
288 /**
289  * rnbd_rerun_all_if_idle() - rerun all queues left in the list if
290  *                               session is idling (there are no requests
291  *                               in-flight).
292  * @sess:       Session to rerun the queues on
293  *
294  * Description:
295  *     This function tries to rerun all stopped queues if there are no
296  *     requests in-flight anymore.  It solves an obvious problem that arises
297  *     when the number of tags is smaller than the number of queues (hctxs)
298  *     which are stopped and put to sleep.  If the last permit, which has
299  *     just been put, does not wake up all remaining queues (hctxs), IO
300  *     requests hang forever.
301  *
302  *     That can happen when all N permits have been exhausted from one CPU
303  *     and we have many block devices per session, say M.  Each block device
304  *     has its own queue (hctx) for each CPU, so eventually we can put
305  *     M x nr_cpu_ids queues (hctxs) to sleep.  If N < M x nr_cpu_ids, we get an IO hang.
306  *
307  *     To avoid this hang, the last caller of rnbd_put_permit() (the one who
308  *     observes sess->busy == 0) must wake up all remaining queues.
309  *
310  * Context:
311  *     Does not matter.
312  */
313 static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess)
314 {
315         bool requeued;
316
317         do {
318                 requeued = rnbd_rerun_if_needed(sess);
319         } while (atomic_read(&sess->busy) == 0 && requeued);
320 }
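/*
 * Illustration of the hang described above, with made-up numbers: say
 * the session has N = 64 permits, M = 16 mapped devices and
 * nr_cpu_ids == 8.  Up to M x nr_cpu_ids = 128 hctxs can be stopped
 * while at most 64 permit releases happen, so the releases alone cannot
 * restart every stopped queue.  The caller of rnbd_put_permit() that
 * observes sess->busy == 0 therefore keeps calling
 * rnbd_rerun_if_needed() here until a request is in flight again or no
 * stopped queue is left.
 */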
321
322 static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
323                                              enum rtrs_clt_con_type con_type,
324                                              int wait)
325 {
326         struct rtrs_permit *permit;
327
328         permit = rtrs_clt_get_permit(sess->rtrs, con_type,
329                                       wait ? RTRS_PERMIT_WAIT :
330                                       RTRS_PERMIT_NOWAIT);
331         if (likely(permit))
332                 /* We have a subtle rare case here, when all permits can be
333                  * consumed before the busy counter is increased.  This is safe,
334                  * because the loser will get NULL as a permit, observe a zero busy
335                  * counter and immediately restart the queue itself.
336                  */
337                 atomic_inc(&sess->busy);
338
339         return permit;
340 }
341
342 static void rnbd_put_permit(struct rnbd_clt_session *sess,
343                              struct rtrs_permit *permit)
344 {
345         rtrs_clt_put_permit(sess->rtrs, permit);
346         atomic_dec(&sess->busy);
347         /* Paired with rnbd_clt_dev_add_to_requeue().  Decrement first
348          * and then check queue bits.
349          */
350         smp_mb__after_atomic();
351         rnbd_rerun_all_if_idle(sess);
352 }
353
354 static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
355                                      enum rtrs_clt_con_type con_type,
356                                      int wait)
357 {
358         struct rnbd_iu *iu;
359         struct rtrs_permit *permit;
360
361         permit = rnbd_get_permit(sess, con_type,
362                                   wait ? RTRS_PERMIT_WAIT :
363                                   RTRS_PERMIT_NOWAIT);
364         if (unlikely(!permit))
365                 return NULL;
366         iu = rtrs_permit_to_pdu(permit);
367         iu->permit = permit;
368         /*
369          * The 1st reference is dropped after finishing sending a "user" message,
370          * the 2nd reference is dropped after the confirmation with the response
371          * is returned.
372          * The 1st and 2nd can happen in any order, so the rnbd_iu should be
373          * released (the rtrs_permit returned to RTRS) only after both
374          * are finished.
375          */
376         atomic_set(&iu->refcount, 2);
377         init_waitqueue_head(&iu->comp.wait);
378         iu->comp.errno = INT_MAX;
379
380         return iu;
381 }
382
383 static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
384 {
385         if (atomic_dec_and_test(&iu->refcount))
386                 rnbd_put_permit(sess, iu->permit);
387 }
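/*
 * Lifecycle sketch of the two references taken in rnbd_get_iu(), using
 * send_msg_close() below as the example: on success one reference is
 * dropped by the confirmation work (msg_close_conf() -> rnbd_put_iu())
 * and the other by the caller after the optional wait; if
 * send_usr_msg() fails, the confirmation never runs and the caller
 * drops both references itself.  Only the final rnbd_put_iu() hands the
 * permit back to RTRS via rnbd_put_permit().
 */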
388
389 static void rnbd_softirq_done_fn(struct request *rq)
390 {
391         struct rnbd_clt_dev *dev        = rq->rq_disk->private_data;
392         struct rnbd_clt_session *sess   = dev->sess;
393         struct rnbd_iu *iu;
394
395         iu = blk_mq_rq_to_pdu(rq);
396         rnbd_put_permit(sess, iu->permit);
397         blk_mq_end_request(rq, errno_to_blk_status(iu->errno));
398 }
399
400 static void msg_io_conf(void *priv, int errno)
401 {
402         struct rnbd_iu *iu = priv;
403         struct rnbd_clt_dev *dev = iu->dev;
404         struct request *rq = iu->rq;
405         int rw = rq_data_dir(rq);
406
407         iu->errno = errno;
408
409         blk_mq_complete_request(rq);
410
411         if (errno)
412                 rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n",
413                                  rw == READ ? "read" : "write", errno);
414 }
415
416 static void wake_up_iu_comp(struct rnbd_iu *iu, int errno)
417 {
418         iu->comp.errno = errno;
419         wake_up(&iu->comp.wait);
420 }
421
422 static void msg_conf(void *priv, int errno)
423 {
424         struct rnbd_iu *iu = priv;
425
426         iu->errno = errno;
427         schedule_work(&iu->work);
428 }
429
430 enum wait_type {
431         NO_WAIT = 0,
432         WAIT    = 1
433 };
434
435 static int send_usr_msg(struct rtrs_clt *rtrs, int dir,
436                         struct rnbd_iu *iu, struct kvec *vec, size_t nr,
437                         size_t len, struct scatterlist *sg, unsigned int sg_len,
438                         void (*conf)(struct work_struct *work),
439                         int *errno, enum wait_type wait)
440 {
441         int err;
442         struct rtrs_clt_req_ops req_ops;
443
444         INIT_WORK(&iu->work, conf);
445         req_ops = (struct rtrs_clt_req_ops) {
446                 .priv = iu,
447                 .conf_fn = msg_conf,
448         };
449         err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit,
450                                 vec, nr, len, sg, sg_len);
451         if (!err && wait) {
452                 wait_event(iu->comp.wait, iu->comp.errno != INT_MAX);
453                 *errno = iu->comp.errno;
454         } else {
455                 *errno = 0;
456         }
457
458         return err;
459 }
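/*
 * Usage sketch, mirroring send_msg_close()/send_msg_open() below: the
 * caller packs the request into a kvec, picks a per-message
 * confirmation work and chooses the wait type, e.g.
 *
 *	err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 1, 0, NULL, 0,
 *			   msg_close_conf, &errno, WAIT);
 *
 * With WAIT the caller sleeps on iu->comp.wait until msg_conf() has
 * scheduled @conf and the work has called wake_up_iu_comp(); the result
 * then lands in *errno.  With NO_WAIT, *errno is set to 0 and all
 * completion handling is left to the @conf work - remap_devs() relies
 * on this, since blocking in the RTRS link event path would deadlock.
 */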
460
461 static void msg_close_conf(struct work_struct *work)
462 {
463         struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
464         struct rnbd_clt_dev *dev = iu->dev;
465
466         wake_up_iu_comp(iu, iu->errno);
467         rnbd_put_iu(dev->sess, iu);
468         rnbd_clt_put_dev(dev);
469 }
470
471 static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait)
472 {
473         struct rnbd_clt_session *sess = dev->sess;
474         struct rnbd_msg_close msg;
475         struct rnbd_iu *iu;
476         struct kvec vec = {
477                 .iov_base = &msg,
478                 .iov_len  = sizeof(msg)
479         };
480         int err, errno;
481
482         iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
483         if (!iu)
484                 return -ENOMEM;
485
486         iu->buf = NULL;
487         iu->dev = dev;
488
489         sg_mark_end(&iu->sglist[0]);
490
491         msg.hdr.type    = cpu_to_le16(RNBD_MSG_CLOSE);
492         msg.device_id   = cpu_to_le32(device_id);
493
494         WARN_ON(!rnbd_clt_get_dev(dev));
495         err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 1, 0, NULL, 0,
496                            msg_close_conf, &errno, wait);
497         if (err) {
498                 rnbd_clt_put_dev(dev);
499                 rnbd_put_iu(sess, iu);
500         } else {
501                 err = errno;
502         }
503
504         rnbd_put_iu(sess, iu);
505         return err;
506 }
507
508 static void msg_open_conf(struct work_struct *work)
509 {
510         struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
511         struct rnbd_msg_open_rsp *rsp = iu->buf;
512         struct rnbd_clt_dev *dev = iu->dev;
513         int errno = iu->errno;
514
515         if (errno) {
516                 rnbd_clt_err(dev,
517                               "Opening failed, server responded: %d\n",
518                               errno);
519         } else {
520                 errno = process_msg_open_rsp(dev, rsp);
521                 if (errno) {
522                         u32 device_id = le32_to_cpu(rsp->device_id);
523                         /*
524                          * If the server thinks it's fine, but we fail to process
525                          * the response, then be nice and send a close to the server.
526                          */
527                         (void)send_msg_close(dev, device_id, NO_WAIT);
528                 }
529         }
530         kfree(rsp);
531         wake_up_iu_comp(iu, errno);
532         rnbd_put_iu(dev->sess, iu);
533         rnbd_clt_put_dev(dev);
534 }
535
536 static void msg_sess_info_conf(struct work_struct *work)
537 {
538         struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
539         struct rnbd_msg_sess_info_rsp *rsp = iu->buf;
540         struct rnbd_clt_session *sess = iu->sess;
541
542         if (!iu->errno)
543                 sess->ver = min_t(u8, rsp->ver, RNBD_PROTO_VER_MAJOR);
544
545         kfree(rsp);
546         wake_up_iu_comp(iu, iu->errno);
547         rnbd_put_iu(sess, iu);
548         rnbd_clt_put_sess(sess);
549 }
550
551 static int send_msg_open(struct rnbd_clt_dev *dev, bool wait)
552 {
553         struct rnbd_clt_session *sess = dev->sess;
554         struct rnbd_msg_open_rsp *rsp;
555         struct rnbd_msg_open msg;
556         struct rnbd_iu *iu;
557         struct kvec vec = {
558                 .iov_base = &msg,
559                 .iov_len  = sizeof(msg)
560         };
561         int err, errno;
562
563         rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
564         if (!rsp)
565                 return -ENOMEM;
566
567         iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
568         if (!iu) {
569                 kfree(rsp);
570                 return -ENOMEM;
571         }
572
573         iu->buf = rsp;
574         iu->dev = dev;
575
576         sg_init_one(iu->sglist, rsp, sizeof(*rsp));
577
578         msg.hdr.type    = cpu_to_le16(RNBD_MSG_OPEN);
579         msg.access_mode = dev->access_mode;
580         strlcpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
581
582         WARN_ON(!rnbd_clt_get_dev(dev));
583         err = send_usr_msg(sess->rtrs, READ, iu,
584                            &vec, 1, sizeof(*rsp), iu->sglist, 1,
585                            msg_open_conf, &errno, wait);
586         if (err) {
587                 rnbd_clt_put_dev(dev);
588                 rnbd_put_iu(sess, iu);
589                 kfree(rsp);
590         } else {
591                 err = errno;
592         }
593
594         rnbd_put_iu(sess, iu);
595         return err;
596 }
597
598 static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait)
599 {
600         struct rnbd_msg_sess_info_rsp *rsp;
601         struct rnbd_msg_sess_info msg;
602         struct rnbd_iu *iu;
603         struct kvec vec = {
604                 .iov_base = &msg,
605                 .iov_len  = sizeof(msg)
606         };
607         int err, errno;
608
609         rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
610         if (!rsp)
611                 return -ENOMEM;
612
613         iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
614         if (!iu) {
615                 kfree(rsp);
616                 return -ENOMEM;
617         }
618
619         iu->buf = rsp;
620         iu->sess = sess;
621
622         sg_init_one(iu->sglist, rsp, sizeof(*rsp));
623
624         msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO);
625         msg.ver      = RNBD_PROTO_VER_MAJOR;
626
627         if (!rnbd_clt_get_sess(sess)) {
628                 /*
629                  * That can happen only in one case, when RTRS has re-established
630                  * the connection and link_ev() is called, but the session is almost
631                  * dead, the last reference on the session is put and the caller is
632                  * waiting for RTRS to close everything.
633                  */
634                 err = -ENODEV;
635                 goto put_iu;
636         }
637         err = send_usr_msg(sess->rtrs, READ, iu,
638                            &vec, 1, sizeof(*rsp), iu->sglist, 1,
639                            msg_sess_info_conf, &errno, wait);
640         if (err) {
641                 rnbd_clt_put_sess(sess);
642 put_iu:
643                 rnbd_put_iu(sess, iu);
644                 kfree(rsp);
645         } else {
646                 err = errno;
647         }
648
649         rnbd_put_iu(sess, iu);
650         return err;
651 }
652
653 static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess)
654 {
655         struct rnbd_clt_dev *dev;
656
657         mutex_lock(&sess->lock);
658         list_for_each_entry(dev, &sess->devs_list, list) {
659                 rnbd_clt_err(dev, "Device disconnected.\n");
660
661                 mutex_lock(&dev->lock);
662                 if (dev->dev_state == DEV_STATE_MAPPED)
663                         dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED;
664                 mutex_unlock(&dev->lock);
665         }
666         mutex_unlock(&sess->lock);
667 }
668
669 static void remap_devs(struct rnbd_clt_session *sess)
670 {
671         struct rnbd_clt_dev *dev;
672         struct rtrs_attrs attrs;
673         int err;
674
675         /*
676          * Careful here: we are called from RTRS link event directly,
677          * thus we can't send any RTRS request and wait for response
678          * or RTRS will not be able to complete request with failure
679          * if something goes wrong (failing of outstanding requests
680          * happens exactly from the context where we are blocking now).
681          *
682          * So to avoid deadlocks each usr message sent from here must
683          * be asynchronous.
684          */
685
686         err = send_msg_sess_info(sess, NO_WAIT);
687         if (err) {
688                 pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err);
689                 return;
690         }
691
692         rtrs_clt_query(sess->rtrs, &attrs);
693         mutex_lock(&sess->lock);
694         sess->max_io_size = attrs.max_io_size;
695
696         list_for_each_entry(dev, &sess->devs_list, list) {
697                 bool skip;
698
699                 mutex_lock(&dev->lock);
700                 skip = (dev->dev_state == DEV_STATE_INIT);
701                 mutex_unlock(&dev->lock);
702                 if (skip)
703                         /*
704                          * When the device is establishing the connection for the
705                          * first time, do not remap; it will be closed soon.
706                          */
707                         continue;
708
709                 rnbd_clt_info(dev, "session reconnected, remapping device\n");
710                 err = send_msg_open(dev, NO_WAIT);
711                 if (err) {
712                         rnbd_clt_err(dev, "send_msg_open(): %d\n", err);
713                         break;
714                 }
715         }
716         mutex_unlock(&sess->lock);
717 }
718
719 static void rnbd_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev)
720 {
721         struct rnbd_clt_session *sess = priv;
722
723         switch (ev) {
724         case RTRS_CLT_LINK_EV_DISCONNECTED:
725                 set_dev_states_to_disconnected(sess);
726                 break;
727         case RTRS_CLT_LINK_EV_RECONNECTED:
728                 remap_devs(sess);
729                 break;
730         default:
731                 pr_err("Unknown session event received (%d), session: %s\n",
732                        ev, sess->sessname);
733         }
734 }
735
736 static void rnbd_init_cpu_qlists(struct rnbd_cpu_qlist __percpu *cpu_queues)
737 {
738         unsigned int cpu;
739         struct rnbd_cpu_qlist *cpu_q;
740
741         for_each_possible_cpu(cpu) {
742                 cpu_q = per_cpu_ptr(cpu_queues, cpu);
743
744                 cpu_q->cpu = cpu;
745                 INIT_LIST_HEAD(&cpu_q->requeue_list);
746                 spin_lock_init(&cpu_q->requeue_lock);
747         }
748 }
749
750 static void destroy_mq_tags(struct rnbd_clt_session *sess)
751 {
752         if (sess->tag_set.tags)
753                 blk_mq_free_tag_set(&sess->tag_set);
754 }
755
756 static inline void wake_up_rtrs_waiters(struct rnbd_clt_session *sess)
757 {
758         sess->rtrs_ready = true;
759         wake_up_all(&sess->rtrs_waitq);
760 }
761
762 static void close_rtrs(struct rnbd_clt_session *sess)
763 {
764         might_sleep();
765
766         if (!IS_ERR_OR_NULL(sess->rtrs)) {
767                 rtrs_clt_close(sess->rtrs);
768                 sess->rtrs = NULL;
769                 wake_up_rtrs_waiters(sess);
770         }
771 }
772
773 static void free_sess(struct rnbd_clt_session *sess)
774 {
775         WARN_ON(!list_empty(&sess->devs_list));
776
777         might_sleep();
778
779         close_rtrs(sess);
780         destroy_mq_tags(sess);
781         if (!list_empty(&sess->list)) {
782                 mutex_lock(&sess_lock);
783                 list_del(&sess->list);
784                 mutex_unlock(&sess_lock);
785         }
786         free_percpu(sess->cpu_queues);
787         free_percpu(sess->cpu_rr);
788         mutex_destroy(&sess->lock);
789         kfree(sess);
790 }
791
792 static struct rnbd_clt_session *alloc_sess(const char *sessname)
793 {
794         struct rnbd_clt_session *sess;
795         int err, cpu;
796
797         sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
798         if (!sess)
799                 return ERR_PTR(-ENOMEM);
800         strlcpy(sess->sessname, sessname, sizeof(sess->sessname));
801         atomic_set(&sess->busy, 0);
802         mutex_init(&sess->lock);
803         INIT_LIST_HEAD(&sess->devs_list);
804         INIT_LIST_HEAD(&sess->list);
805         bitmap_zero(sess->cpu_queues_bm, NR_CPUS);
806         init_waitqueue_head(&sess->rtrs_waitq);
807         refcount_set(&sess->refcount, 1);
808
809         sess->cpu_queues = alloc_percpu(struct rnbd_cpu_qlist);
810         if (!sess->cpu_queues) {
811                 err = -ENOMEM;
812                 goto err;
813         }
814         rnbd_init_cpu_qlists(sess->cpu_queues);
815
816         /*
817          * That is a simple percpu variable which stores CPU indices, which are
818          * incremented on each access.  We need that for the sake of fairness
819          * to wake up queues in a round-robin manner.
820          */
821         sess->cpu_rr = alloc_percpu(int);
822         if (!sess->cpu_rr) {
823                 err = -ENOMEM;
824                 goto err;
825         }
826         for_each_possible_cpu(cpu)
827                 * per_cpu_ptr(sess->cpu_rr, cpu) = cpu;
828
829         return sess;
830
831 err:
832         free_sess(sess);
833
834         return ERR_PTR(err);
835 }
836
837 static int wait_for_rtrs_connection(struct rnbd_clt_session *sess)
838 {
839         wait_event(sess->rtrs_waitq, sess->rtrs_ready);
840         if (IS_ERR_OR_NULL(sess->rtrs))
841                 return -ECONNRESET;
842
843         return 0;
844 }
845
846 static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess)
847         __releases(&sess_lock)
848         __acquires(&sess_lock)
849 {
850         DEFINE_WAIT(wait);
851
852         prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE);
853         if (IS_ERR_OR_NULL(sess->rtrs)) {
854                 finish_wait(&sess->rtrs_waitq, &wait);
855                 return;
856         }
857         mutex_unlock(&sess_lock);
858         /* loop in caller, see __find_and_get_sess().
859          * You can't leave mutex locked and call schedule(), you will catch a
860          * deadlock with a caller of free_sess(), which has just put the last
861          * reference and is about to take the sess_lock in order to delete
862          * the session from the list.
863          */
864         schedule();
865         mutex_lock(&sess_lock);
866 }
867
868 static struct rnbd_clt_session *__find_and_get_sess(const char *sessname)
869         __releases(&sess_lock)
870         __acquires(&sess_lock)
871 {
872         struct rnbd_clt_session *sess, *sn;
873         int err;
874
875 again:
876         list_for_each_entry_safe(sess, sn, &sess_list, list) {
877                 if (strcmp(sessname, sess->sessname))
878                         continue;
879
880                 if (sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs))
881                         /*
882                          * No RTRS connection, session is dying.
883                          */
884                         continue;
885
886                 if (rnbd_clt_get_sess(sess)) {
887                         /*
888                          * Alive session is found, wait for RTRS connection.
889                          */
890                         mutex_unlock(&sess_lock);
891                         err = wait_for_rtrs_connection(sess);
892                         if (err)
893                                 rnbd_clt_put_sess(sess);
894                         mutex_lock(&sess_lock);
895
896                         if (err)
897                                 /* Session is dying, repeat the loop */
898                                 goto again;
899
900                         return sess;
901                 }
902                 /*
903                  * Ref is 0, session is dying, wait for RTRS disconnect
904                  * in order to avoid session name clashes.
905                  */
906                 wait_for_rtrs_disconnection(sess);
907                 /*
908                  * RTRS is disconnected and soon session will be freed,
909                  * so repeat a loop.
910                  */
911                 goto again;
912         }
913
914         return NULL;
915 }
916
917 static struct
918 rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first)
919 {
920         struct rnbd_clt_session *sess = NULL;
921
922         mutex_lock(&sess_lock);
923         sess = __find_and_get_sess(sessname);
924         if (!sess) {
925                 sess = alloc_sess(sessname);
926                 if (IS_ERR(sess)) {
927                         mutex_unlock(&sess_lock);
928                         return sess;
929                 }
930                 list_add(&sess->list, &sess_list);
931                 *first = true;
932         } else
933                 *first = false;
934         mutex_unlock(&sess_lock);
935
936         return sess;
937 }
938
939 static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
940 {
941         struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
942
943         if (dev->read_only && (mode & FMODE_WRITE))
944                 return -EPERM;
945
946         if (dev->dev_state == DEV_STATE_UNMAPPED ||
947             !rnbd_clt_get_dev(dev))
948                 return -EIO;
949
950         return 0;
951 }
952
953 static void rnbd_client_release(struct gendisk *gen, fmode_t mode)
954 {
955         struct rnbd_clt_dev *dev = gen->private_data;
956
957         rnbd_clt_put_dev(dev);
958 }
959
960 static int rnbd_client_getgeo(struct block_device *block_device,
961                               struct hd_geometry *geo)
962 {
963         u64 size;
964         struct rnbd_clt_dev *dev;
965
966         dev = block_device->bd_disk->private_data;
967         size = dev->size * (dev->logical_block_size / SECTOR_SIZE);
968         geo->cylinders  = size >> 6;    /* size/64 */
969         geo->heads      = 4;
970         geo->sectors    = 16;
971         geo->start      = 0;
972
973         return 0;
974 }
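/*
 * The geometry reported above is synthetic (there is no real spinning
 * disk behind the device): with 4 heads x 16 sectors there are 64
 * sectors per cylinder, so the cylinder count is simply the computed
 * size shifted right by 6, i.e. divided by 64.
 */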
975
976 static const struct block_device_operations rnbd_client_ops = {
977         .owner          = THIS_MODULE,
978         .open           = rnbd_client_open,
979         .release        = rnbd_client_release,
980         .getgeo         = rnbd_client_getgeo
981 };
982
983 /* The amount of data that belongs to an I/O and the amount of data that
984  * should be read or written to the disk (bi_size) can differ.
985  *
986  * E.g. When WRITE_SAME is used, only a small amount of data is
987  * transferred that is then written repeatedly over a lot of sectors.
988  *
989  * Get the size of data to be transferred via RTRS by summing up the size
990  * of the scatter-gather list entries.
991  */
992 static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len)
993 {
994         struct scatterlist *sg;
995         size_t tsize = 0;
996         int i;
997
998         for_each_sg(sglist, sg, len, i)
999                 tsize += sg->length;
1000         return tsize;
1001 }
1002
1003 static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
1004                                      struct request *rq,
1005                                      struct rnbd_iu *iu)
1006 {
1007         struct rtrs_clt *rtrs = dev->sess->rtrs;
1008         struct rtrs_permit *permit = iu->permit;
1009         struct rnbd_msg_io msg;
1010         struct rtrs_clt_req_ops req_ops;
1011         unsigned int sg_cnt = 0;
1012         struct kvec vec;
1013         size_t size;
1014         int err;
1015
1016         iu->rq          = rq;
1017         iu->dev         = dev;
1018         msg.sector      = cpu_to_le64(blk_rq_pos(rq));
1019         msg.bi_size     = cpu_to_le32(blk_rq_bytes(rq));
1020         msg.rw          = cpu_to_le32(rq_to_rnbd_flags(rq));
1021         msg.prio        = cpu_to_le16(req_get_ioprio(rq));
1022
1023         /*
1024          * We only support discards with a single segment for now.
1025          * See queue limits.
1026          */
1027         if (req_op(rq) != REQ_OP_DISCARD)
1028                 sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist);
1029
1030         if (sg_cnt == 0)
1031                 /* Do not forget to mark the end */
1032                 sg_mark_end(&iu->sglist[0]);
1033
1034         msg.hdr.type    = cpu_to_le16(RNBD_MSG_IO);
1035         msg.device_id   = cpu_to_le32(dev->device_id);
1036
1037         vec = (struct kvec) {
1038                 .iov_base = &msg,
1039                 .iov_len  = sizeof(msg)
1040         };
1041         size = rnbd_clt_get_sg_size(iu->sglist, sg_cnt);
1042         req_ops = (struct rtrs_clt_req_ops) {
1043                 .priv = iu,
1044                 .conf_fn = msg_io_conf,
1045         };
1046         err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
1047                                &vec, 1, size, iu->sglist, sg_cnt);
1048         if (unlikely(err)) {
1049                 rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
1050                                  err);
1051                 return err;
1052         }
1053
1054         return 0;
1055 }
1056
1057 /**
1058  * rnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy
1059  * @dev:        Device to be checked
1060  * @q:          Queue to be added to the requeue list if required
1061  *
1062  * Description:
1063  *     If the session is busy, that means someone will requeue us when
1064  *     resources are freed.  If the session is not doing anything, the queue
1065  *     is not added to the list and false is returned.
1066  */
1067 static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
1068                                                 struct rnbd_queue *q)
1069 {
1070         struct rnbd_clt_session *sess = dev->sess;
1071         struct rnbd_cpu_qlist *cpu_q;
1072         unsigned long flags;
1073         bool added = true;
1074         bool need_set;
1075
1076         cpu_q = get_cpu_ptr(sess->cpu_queues);
1077         spin_lock_irqsave(&cpu_q->requeue_lock, flags);
1078
1079         if (likely(!test_and_set_bit_lock(0, &q->in_list))) {
1080                 if (WARN_ON(!list_empty(&q->requeue_list)))
1081                         goto unlock;
1082
1083                 need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm);
1084                 if (need_set) {
1085                         set_bit(cpu_q->cpu, sess->cpu_queues_bm);
1086                         /* Paired with rnbd_put_permit(). Set a bit first
1087                          * and then observe the busy counter.
1088                          */
1089                         smp_mb__before_atomic();
1090                 }
1091                 if (likely(atomic_read(&sess->busy))) {
1092                         list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
1093                 } else {
1094                         /* Very unlikely, but possible: busy counter was
1095                          * observed as zero.  Drop all bits and return
1096                          * false to restart the queue by ourselves.
1097                          */
1098                         if (need_set)
1099                                 clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
1100                         clear_bit_unlock(0, &q->in_list);
1101                         added = false;
1102                 }
1103         }
1104 unlock:
1105         spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
1106         put_cpu_ptr(sess->cpu_queues);
1107
1108         return added;
1109 }
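/*
 * Sketch of the intended barrier pairing (a restatement of the existing
 * comments, not new behaviour): the enqueuing side above sets the CPU
 * bit and only then reads sess->busy, while rnbd_put_permit()
 * decrements sess->busy and only then scans the CPU bits.  Either the
 * enqueuer still sees busy != 0, in which case the permit put that
 * later drives busy to 0 will see the bit and rerun the queue, or the
 * enqueuer sees busy == 0, returns false and the queue is restarted by
 * the caller itself.
 */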
1110
1111 static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev,
1112                                         struct blk_mq_hw_ctx *hctx,
1113                                         int delay)
1114 {
1115         struct rnbd_queue *q = hctx->driver_data;
1116
1117         if (delay != RNBD_DELAY_IFBUSY)
1118                 blk_mq_delay_run_hw_queue(hctx, delay);
1119         else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q)))
1120                 /*
1121                  * If session is not busy we have to restart
1122                  * the queue ourselves.
1123                  */
1124                 blk_mq_delay_run_hw_queue(hctx, 10/*ms*/);
1125 }
1126
1127 static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
1128                                    const struct blk_mq_queue_data *bd)
1129 {
1130         struct request *rq = bd->rq;
1131         struct rnbd_clt_dev *dev = rq->rq_disk->private_data;
1132         struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
1133         int err;
1134
1135         if (unlikely(dev->dev_state != DEV_STATE_MAPPED))
1136                 return BLK_STS_IOERR;
1137
1138         iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON,
1139                                       RTRS_PERMIT_NOWAIT);
1140         if (unlikely(!iu->permit)) {
1141                 rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY);
1142                 return BLK_STS_RESOURCE;
1143         }
1144
1145         blk_mq_start_request(rq);
1146         err = rnbd_client_xfer_request(dev, rq, iu);
1147         if (likely(err == 0))
1148                 return BLK_STS_OK;
1149         if (unlikely(err == -EAGAIN || err == -ENOMEM)) {
1150                 rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
1151                 rnbd_put_permit(dev->sess, iu->permit);
1152                 return BLK_STS_RESOURCE;
1153         }
1154
1155         rnbd_put_permit(dev->sess, iu->permit);
1156         return BLK_STS_IOERR;
1157 }
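/*
 * Summary of the outcomes above (descriptive only): no free permit ->
 * BLK_STS_RESOURCE, and the hctx is either parked on the session's
 * requeue lists (rerun once a permit is put) or rerun after 10 ms if
 * the session turned out to be idle; -EAGAIN/-ENOMEM from the transfer
 * -> BLK_STS_RESOURCE with a 10 ms delayed rerun; any other transfer
 * error -> BLK_STS_IOERR.
 */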
1158
1159 static int rnbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1160                               unsigned int hctx_idx, unsigned int numa_node)
1161 {
1162         struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
1163
1164         sg_init_table(iu->sglist, BMAX_SEGMENTS);
1165         return 0;
1166 }
1167
1168 static struct blk_mq_ops rnbd_mq_ops = {
1169         .queue_rq       = rnbd_queue_rq,
1170         .init_request   = rnbd_init_request,
1171         .complete       = rnbd_softirq_done_fn,
1172 };
1173
1174 static int setup_mq_tags(struct rnbd_clt_session *sess)
1175 {
1176         struct blk_mq_tag_set *tag_set = &sess->tag_set;
1177
1178         memset(tag_set, 0, sizeof(*tag_set));
1179         tag_set->ops            = &rnbd_mq_ops;
1180         tag_set->queue_depth    = sess->queue_depth;
1181         tag_set->numa_node              = NUMA_NO_NODE;
1182         tag_set->flags          = BLK_MQ_F_SHOULD_MERGE |
1183                                   BLK_MQ_F_TAG_SHARED;
1184         tag_set->cmd_size               = sizeof(struct rnbd_iu);
1185         tag_set->nr_hw_queues   = num_online_cpus();
1186
1187         return blk_mq_alloc_tag_set(tag_set);
1188 }
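/*
 * Design note (an inference from the code, not a statement by the
 * original authors): the tag set lives in the session rather than in
 * the device, and BLK_MQ_F_TAG_SHARED makes every device mapped on this
 * session share the same sess->queue_depth tags.  Requests additionally
 * need an RTRS permit (see rnbd_get_permit()), so exhaustion is a
 * session-wide condition, which is why stopped queues are tracked on
 * the session's per-CPU requeue lists rather than per device.
 */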
1189
1190 static struct rnbd_clt_session *
1191 find_and_get_or_create_sess(const char *sessname,
1192                             const struct rtrs_addr *paths,
1193                             size_t path_cnt, u16 port_nr)
1194 {
1195         struct rnbd_clt_session *sess;
1196         struct rtrs_attrs attrs;
1197         int err;
1198         bool first;
1199         struct rtrs_clt_ops rtrs_ops;
1200
1201         sess = find_or_create_sess(sessname, &first);
1202         if (sess == ERR_PTR(-ENOMEM))
1203                 return ERR_PTR(-ENOMEM);
1204         else if (!first)
1205                 return sess;
1206
1207         rtrs_ops = (struct rtrs_clt_ops) {
1208                 .priv = sess,
1209                 .link_ev = rnbd_clt_link_ev,
1210         };
1211         /*
1212          * Nothing was found, establish rtrs connection and proceed further.
1213          */
1214         sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname,
1215                                    paths, path_cnt, port_nr,
1216                                    sizeof(struct rnbd_iu),
1217                                    RECONNECT_DELAY, BMAX_SEGMENTS,
1218                                    BLK_MAX_SEGMENT_SIZE,
1219                                    MAX_RECONNECTS);
1220         if (IS_ERR(sess->rtrs)) {
1221                 err = PTR_ERR(sess->rtrs);
1222                 goto wake_up_and_put;
1223         }
1224         rtrs_clt_query(sess->rtrs, &attrs);
1225         sess->max_io_size = attrs.max_io_size;
1226         sess->queue_depth = attrs.queue_depth;
1227
1228         err = setup_mq_tags(sess);
1229         if (err)
1230                 goto close_rtrs;
1231
1232         err = send_msg_sess_info(sess, WAIT);
1233         if (err)
1234                 goto close_rtrs;
1235
1236         wake_up_rtrs_waiters(sess);
1237
1238         return sess;
1239
1240 close_rtrs:
1241         close_rtrs(sess);
1242 put_sess:
1243         rnbd_clt_put_sess(sess);
1244
1245         return ERR_PTR(err);
1246
1247 wake_up_and_put:
1248         wake_up_rtrs_waiters(sess);
1249         goto put_sess;
1250 }
1251
1252 static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev,
1253                                        struct rnbd_queue *q,
1254                                        struct blk_mq_hw_ctx *hctx)
1255 {
1256         INIT_LIST_HEAD(&q->requeue_list);
1257         q->dev  = dev;
1258         q->hctx = hctx;
1259 }
1260
1261 static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
1262 {
1263         int i;
1264         struct blk_mq_hw_ctx *hctx;
1265         struct rnbd_queue *q;
1266
1267         queue_for_each_hw_ctx(dev->queue, hctx, i) {
1268                 q = &dev->hw_queues[i];
1269                 rnbd_init_hw_queue(dev, q, hctx);
1270                 hctx->driver_data = q;
1271         }
1272 }
1273
1274 static int setup_mq_dev(struct rnbd_clt_dev *dev)
1275 {
1276         dev->queue = blk_mq_init_queue(&dev->sess->tag_set);
1277         if (IS_ERR(dev->queue)) {
1278                 rnbd_clt_err(dev, "Initializing multiqueue queue failed, err: %ld\n",
1279                               PTR_ERR(dev->queue));
1280                 return PTR_ERR(dev->queue);
1281         }
1282         rnbd_init_mq_hw_queues(dev);
1283         return 0;
1284 }
1285
1286 static void setup_request_queue(struct rnbd_clt_dev *dev)
1287 {
1288         blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
1289         blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
1290         blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
1291         blk_queue_max_write_same_sectors(dev->queue,
1292                                          dev->max_write_same_sectors);
1293
1294         /*
1295          * We don't support discards to "discontiguous" segments
1296          * in one request.
1297          */
1298         blk_queue_max_discard_segments(dev->queue, 1);
1299
1300         blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
1301         dev->queue->limits.discard_granularity  = dev->discard_granularity;
1302         dev->queue->limits.discard_alignment    = dev->discard_alignment;
1303         if (dev->max_discard_sectors)
1304                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue);
1305         if (dev->secure_discard)
1306                 blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue);
1307
1308         blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
1309         blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
1310         blk_queue_max_segments(dev->queue, dev->max_segments);
1311         blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
1312         blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
1313         blk_queue_write_cache(dev->queue, true, true);
1314         dev->queue->queuedata = dev;
1315 }
1316
1317 static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx)
1318 {
1319         dev->gd->major          = rnbd_client_major;
1320         dev->gd->first_minor    = idx << RNBD_PART_BITS;
1321         dev->gd->fops           = &rnbd_client_ops;
1322         dev->gd->queue          = dev->queue;
1323         dev->gd->private_data   = dev;
1324         snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
1325                  idx);
1326         pr_debug("disk_name=%s, capacity=%zu\n",
1327                  dev->gd->disk_name,
1328                  dev->nsectors * (dev->logical_block_size / SECTOR_SIZE)
1329                  );
1330
1331         set_capacity(dev->gd, dev->nsectors);
1332
1333         if (dev->access_mode == RNBD_ACCESS_RO) {
1334                 dev->read_only = true;
1335                 set_disk_ro(dev->gd, true);
1336         } else {
1337                 dev->read_only = false;
1338         }
1339
1340         if (!dev->rotational)
1341                 blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
1342 }
1343
1344 static int rnbd_client_setup_device(struct rnbd_clt_session *sess,
1345                                      struct rnbd_clt_dev *dev, int idx)
1346 {
1347         int err;
1348
1349         dev->size = dev->nsectors * dev->logical_block_size;
1350
1351         err = setup_mq_dev(dev);
1352         if (err)
1353                 return err;
1354
1355         setup_request_queue(dev);
1356
1357         dev->gd = alloc_disk_node(1 << RNBD_PART_BITS,  NUMA_NO_NODE);
1358         if (!dev->gd) {
1359                 blk_cleanup_queue(dev->queue);
1360                 return -ENOMEM;
1361         }
1362
1363         rnbd_clt_setup_gen_disk(dev, idx);
1364
1365         return 0;
1366 }
1367
1368 static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
1369                                       enum rnbd_access_mode access_mode,
1370                                       const char *pathname)
1371 {
1372         struct rnbd_clt_dev *dev;
1373         int ret;
1374
1375         dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE);
1376         if (!dev)
1377                 return ERR_PTR(-ENOMEM);
1378
1379         dev->hw_queues = kcalloc(nr_cpu_ids, sizeof(*dev->hw_queues),
1380                                  GFP_KERNEL);
1381         if (!dev->hw_queues) {
1382                 ret = -ENOMEM;
1383                 goto out_alloc;
1384         }
1385
1386         mutex_lock(&ida_lock);
1387         ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS),
1388                              GFP_KERNEL);
1389         mutex_unlock(&ida_lock);
1390         if (ret < 0) {
1391                 pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n",
1392                        pathname, sess->sessname, ret);
1393                 goto out_queues;
1394         }
1395         dev->clt_device_id      = ret;
1396         dev->sess               = sess;
1397         dev->access_mode        = access_mode;
1398         strlcpy(dev->pathname, pathname, sizeof(dev->pathname));
1399         mutex_init(&dev->lock);
1400         refcount_set(&dev->refcount, 1);
1401         dev->dev_state = DEV_STATE_INIT;
1402
1403         /*
1404          * Here we are called from the sysfs entry, thus clt-sysfs is
1405          * responsible for making sure the session does not disappear.
1406          */
1407         WARN_ON(!rnbd_clt_get_sess(sess));
1408
1409         return dev;
1410
1411 out_queues:
1412         kfree(dev->hw_queues);
1413 out_alloc:
1414         kfree(dev);
1415         return ERR_PTR(ret);
1416 }
1417
1418 static bool __exists_dev(const char *pathname)
1419 {
1420         struct rnbd_clt_session *sess;
1421         struct rnbd_clt_dev *dev;
1422         bool found = false;
1423
1424         list_for_each_entry(sess, &sess_list, list) {
1425                 mutex_lock(&sess->lock);
1426                 list_for_each_entry(dev, &sess->devs_list, list) {
1427                         if (!strncmp(dev->pathname, pathname,
1428                                      sizeof(dev->pathname))) {
1429                                 found = true;
1430                                 break;
1431                         }
1432                 }
1433                 mutex_unlock(&sess->lock);
1434                 if (found)
1435                         break;
1436         }
1437
1438         return found;
1439 }
1440
1441 static bool exists_devpath(const char *pathname)
1442 {
1443         bool found;
1444
1445         mutex_lock(&sess_lock);
1446         found = __exists_dev(pathname);
1447         mutex_unlock(&sess_lock);
1448
1449         return found;
1450 }
1451
1452 static bool insert_dev_if_not_exists_devpath(const char *pathname,
1453                                              struct rnbd_clt_session *sess,
1454                                              struct rnbd_clt_dev *dev)
1455 {
1456         bool found;
1457
1458         mutex_lock(&sess_lock);
1459         found = __exists_dev(pathname);
1460         if (!found) {
1461                 mutex_lock(&sess->lock);
1462                 list_add_tail(&dev->list, &sess->devs_list);
1463                 mutex_unlock(&sess->lock);
1464         }
1465         mutex_unlock(&sess_lock);
1466
1467         return found;
1468 }
1469
1470 static void delete_dev(struct rnbd_clt_dev *dev)
1471 {
1472         struct rnbd_clt_session *sess = dev->sess;
1473
1474         mutex_lock(&sess->lock);
1475         list_del(&dev->list);
1476         mutex_unlock(&sess->lock);
1477 }
1478
1479 struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
1480                                            struct rtrs_addr *paths,
1481                                            size_t path_cnt, u16 port_nr,
1482                                            const char *pathname,
1483                                            enum rnbd_access_mode access_mode)
1484 {
1485         struct rnbd_clt_session *sess;
1486         struct rnbd_clt_dev *dev;
1487         int ret;
1488
1489         if (exists_devpath(pathname))
1490                 return ERR_PTR(-EEXIST);
1491
1492         sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr);
1493         if (IS_ERR(sess))
1494                 return ERR_CAST(sess);
1495
1496         dev = init_dev(sess, access_mode, pathname);
1497         if (IS_ERR(dev)) {
1498                 pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n",
1499                        pathname, sess->sessname, PTR_ERR(dev));
1500                 ret = PTR_ERR(dev);
1501                 goto put_sess;
1502         }
1503         if (insert_dev_if_not_exists_devpath(pathname, sess, dev)) {
1504                 ret = -EEXIST;
1505                 goto put_dev;
1506         }
1507         ret = send_msg_open(dev, WAIT);
1508         if (ret) {
1509                 rnbd_clt_err(dev,
1510                               "map_device: failed, can't open remote device, err: %d\n",
1511                               ret);
1512                 goto del_dev;
1513         }
1514         mutex_lock(&dev->lock);
1515         pr_debug("Opened remote device: session=%s, path='%s'\n",
1516                  sess->sessname, pathname);
1517         ret = rnbd_client_setup_device(sess, dev, dev->clt_device_id);
1518         if (ret) {
1519                 rnbd_clt_err(dev,
1520                               "map_device: Failed to configure device, err: %d\n",
1521                               ret);
1522                 mutex_unlock(&dev->lock);
1523                 goto del_dev;
1524         }
1525
1526         rnbd_clt_info(dev,
1527                        "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d)\n",
1528                        dev->gd->disk_name, dev->nsectors,
1529                        dev->logical_block_size, dev->physical_block_size,
1530                        dev->max_write_same_sectors, dev->max_discard_sectors,
1531                        dev->discard_granularity, dev->discard_alignment,
1532                        dev->secure_discard, dev->max_segments,
1533                        dev->max_hw_sectors, dev->rotational);
1534
1535         mutex_unlock(&dev->lock);
1536
1537         add_disk(dev->gd);
1538         rnbd_clt_put_sess(sess);
1539
1540         return dev;
1541
1542 del_dev:
1543         delete_dev(dev);
1544 put_dev:
1545         rnbd_clt_put_dev(dev);
1546 put_sess:
1547         rnbd_clt_put_sess(sess);
1548
1549         return ERR_PTR(ret);
1550 }
1551
1552 static void destroy_gen_disk(struct rnbd_clt_dev *dev)
1553 {
1554         del_gendisk(dev->gd);
1555         blk_cleanup_queue(dev->queue);
1556         put_disk(dev->gd);
1557 }
1558
1559 static void destroy_sysfs(struct rnbd_clt_dev *dev,
1560                           const struct attribute *sysfs_self)
1561 {
1562         rnbd_clt_remove_dev_symlink(dev);
1563         if (dev->kobj.state_initialized) {
1564                 if (sysfs_self)
1565                         /* To avoid deadlock firstly remove itself */
1566                         sysfs_remove_file_self(&dev->kobj, sysfs_self);
1567                 kobject_del(&dev->kobj);
1568                 kobject_put(&dev->kobj);
1569         }
1570 }
1571
1572 int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
1573                            const struct attribute *sysfs_self)
1574 {
1575         struct rnbd_clt_session *sess = dev->sess;
1576         int refcount, ret = 0;
1577         bool was_mapped;
1578
1579         mutex_lock(&dev->lock);
1580         if (dev->dev_state == DEV_STATE_UNMAPPED) {
1581                 rnbd_clt_info(dev, "Device is already being unmapped\n");
1582                 ret = -EALREADY;
1583                 goto err;
1584         }
1585         refcount = refcount_read(&dev->refcount);
1586         if (!force && refcount > 1) {
1587                 rnbd_clt_err(dev,
1588                               "Closing device failed, device is in use, (%d device users)\n",
1589                               refcount - 1);
1590                 ret = -EBUSY;
1591                 goto err;
1592         }
1593         was_mapped = (dev->dev_state == DEV_STATE_MAPPED);
1594         dev->dev_state = DEV_STATE_UNMAPPED;
1595         mutex_unlock(&dev->lock);
1596
1597         delete_dev(dev);
1598         destroy_sysfs(dev, sysfs_self);
1599         destroy_gen_disk(dev);
1600         if (was_mapped && sess->rtrs)
1601                 send_msg_close(dev, dev->device_id, WAIT);
1602
1603         rnbd_clt_info(dev, "Device is unmapped\n");
1604
1605         /* Likely last reference put */
1606         rnbd_clt_put_dev(dev);
1607
1608         /*
1609          * Here the device and the session may already have vanished!
1610          */
1611
1612         return 0;
1613 err:
1614         mutex_unlock(&dev->lock);
1615
1616         return ret;
1617 }
1618
1619 int rnbd_clt_remap_device(struct rnbd_clt_dev *dev)
1620 {
1621         int err;
1622
1623         mutex_lock(&dev->lock);
1624         if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED)
1625                 err = 0;
1626         else if (dev->dev_state == DEV_STATE_UNMAPPED)
1627                 err = -ENODEV;
1628         else if (dev->dev_state == DEV_STATE_MAPPED)
1629                 err = -EALREADY;
1630         else
1631                 err = -EBUSY;
1632         mutex_unlock(&dev->lock);
1633         if (!err) {
1634                 rnbd_clt_info(dev, "Remapping device.\n");
1635                 err = send_msg_open(dev, WAIT);
1636                 if (err)
1637                         rnbd_clt_err(dev, "remap_device: %d\n", err);
1638         }
1639
1640         return err;
1641 }
1642
1643 static void unmap_device_work(struct work_struct *work)
1644 {
1645         struct rnbd_clt_dev *dev;
1646
1647         dev = container_of(work, typeof(*dev), unmap_on_rmmod_work);
1648         rnbd_clt_unmap_device(dev, true, NULL);
1649 }
1650
1651 static void rnbd_destroy_sessions(void)
1652 {
1653         struct rnbd_clt_session *sess, *sn;
1654         struct rnbd_clt_dev *dev, *tn;
1655
1656         /* Firstly forbid access through sysfs interface */
1657         rnbd_clt_destroy_default_group();
1658         rnbd_clt_destroy_sysfs_files();
1659
1660         /*
1661          * At this point there is no concurrent access to the session list
1662          * and the device lists:
1663          *   1. New sessions or devices can't be created - session sysfs files
1664          *      are removed.
1665          *   2. Devices or sessions can't be removed - the module reference is
1666          *      taken into account in the unmap device sysfs callback.
1667          *   3. No IO requests are in flight - each file open of a block_dev
1668          *      increases the module reference in get_disk().
1669          *
1670          * But still there can be user requests in flight, which were sent by
1671          * the asynchronous send_msg_*() functions, thus before unmapping devices
1672          * the RTRS session must be closed explicitly.
1673          */
1674
1675         list_for_each_entry_safe(sess, sn, &sess_list, list) {
1676                 WARN_ON(!rnbd_clt_get_sess(sess));
1677                 close_rtrs(sess);
1678                 list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
1679                         /*
1680                          * Here unmap happens in parallel for only one reason:
1681                          * blk_cleanup_queue() takes around half a second, so
1682                          * on huge amount of devices the whole module unload
1683                          * procedure takes minutes.
1684                          */
1685                         INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work);
1686                         queue_work(system_long_wq, &dev->unmap_on_rmmod_work);
1687                 }
1688                 rnbd_clt_put_sess(sess);
1689         }
1690         /* Wait for all scheduled unmap works */
1691         flush_workqueue(system_long_wq);
1692         WARN_ON(!list_empty(&sess_list));
1693 }
1694
1695 static int __init rnbd_client_init(void)
1696 {
1697         int err = 0;
1698
1699         BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4);
1700         BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36);
1701         BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36);
1702         BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264);
1703         BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8);
1704         BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56);
1705         rnbd_client_major = register_blkdev(rnbd_client_major, "rnbd");
1706         if (rnbd_client_major <= 0) {
1707                 pr_err("Failed to load module, block device registration failed\n");
1708                 return -EBUSY;
1709         }
1710
1711         err = rnbd_clt_create_sysfs_files();
1712         if (err) {
1713                 pr_err("Failed to load module, creating sysfs device files failed, err: %d\n",
1714                        err);
1715                 unregister_blkdev(rnbd_client_major, "rnbd");
1716         }
1717
1718         return err;
1719 }
1720
1721 static void __exit rnbd_client_exit(void)
1722 {
1723         rnbd_destroy_sessions();
1724         unregister_blkdev(rnbd_client_major, "rnbd");
1725         ida_destroy(&index_ida);
1726 }
1727
1728 module_init(rnbd_client_init);
1729 module_exit(rnbd_client_exit);