// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "null_blk_trace.h"

/* zone_size is in MB; shifting by ZONE_SIZE_SHIFT converts MB to 512 B sectors. */
#define ZONE_SIZE_SHIFT		11

static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	return sect >> ilog2(dev->zone_size_sects);
}

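/*
 * Validate the configured zone parameters and build the emulated zone
 * array: dev->zone_nr_conv conventional zones first, followed by
 * sequential write required zones. Returns 0 on success or a negative
 * error code.
 */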
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
	sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
	sector_t sector = 0;
	unsigned int i;

	if (!is_power_of_2(dev->zone_size)) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}
	if (dev->zone_size > dev->size) {
		pr_err("Zone size larger than device capacity\n");
		return -EINVAL;
	}

	if (!dev->zone_capacity)
		dev->zone_capacity = dev->zone_size;

	if (dev->zone_capacity > dev->zone_size) {
		pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
		       dev->zone_capacity, dev->zone_size);
		return -EINVAL;
	}

	/* The zone size is a power of two, so any partial trailing zone is dropped. */
	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
	dev->nr_zones = dev_size >>
				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
				    GFP_KERNEL | __GFP_ZERO);
	if (!dev->zones)
		return -ENOMEM;

	spin_lock_init(&dev->zone_dev_lock);
	dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL);
	if (!dev->zone_locks) {
		kvfree(dev->zones);
		return -ENOMEM;
	}

	/* Keep at least one sequential write required zone. */
	if (dev->zone_nr_conv >= dev->nr_zones) {
		dev->zone_nr_conv = dev->nr_zones - 1;
		pr_info("changed the number of conventional zones to %u",
			dev->zone_nr_conv);
	}

	/* Max active zones has to be below the number of seq zones to be enforceable */
	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}

	/* Conventional zones first: no write pointer, condition NOT_WP. */
	for (i = 0; i < dev->zone_nr_conv; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = zone->len;
		zone->wp = zone->start + zone->len;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;

		sector += dev->zone_size_sects;
	}

	/* Remaining zones are sequential write required, starting empty. */
	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = zone->wp = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT;
		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone->cond = BLK_ZONE_COND_EMPTY;

		sector += dev->zone_size_sects;
	}

	q->limits.zoned = BLK_ZONED_HM;
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

	return 0;
}

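/*
 * Example (illustrative parameter values): create a 1 GB zoned nullb
 * device with 64 MB zones, a 62 MB zone capacity, and at most 4 open
 * and 6 active zones:
 *
 *   modprobe null_blk zoned=1 size=1024 zone_size=64 zone_capacity=62 \
 *           zone_max_open=4 zone_max_active=6
 */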
int null_register_zoned_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;
	struct request_queue *q = nullb->q;

	if (queue_is_mq(q)) {
		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

		if (ret)
			return ret;
	} else {
		blk_queue_chunk_sectors(q, dev->zone_size_sects);
		q->nr_zones = blkdev_nr_zones(nullb->disk);
	}

	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
	blk_queue_max_open_zones(q, dev->zone_max_open);
	blk_queue_max_active_zones(q, dev->zone_max_active);

	return 0;
}

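/* Free the zone array and the zone lock bitmap allocated at init time. */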
void null_free_zoned_dev(struct nullb_device *dev)
{
	bitmap_free(dev->zone_locks);
	kvfree(dev->zones);
}

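/*
 * Per-zone bit locks serialize condition changes and I/O within a zone,
 * while dev->zone_dev_lock protects the open/active zone counters that
 * are shared across zones.
 */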
static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno)
{
	wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE);
}

static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno)
{
	clear_and_wake_up_bit(zno, dev->zone_locks);
}

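/*
 * The report_zones method of the nullb gendisk, reached through
 * blkdev_report_zones(). From user space, e.g. (illustrative):
 *
 *   blkzone report /dev/nullb0
 */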
int null_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nullb *nullb = disk->private_data;
	struct nullb_device *dev = nullb->dev;
	unsigned int first_zone, i, zno;
	struct blk_zone zone;
	int error;

	first_zone = null_zone_no(dev, sector);
	if (first_zone >= dev->nr_zones)
		return 0;

	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
	trace_nullb_report_zones(nullb, nr_zones);

	zno = first_zone;
	for (i = 0; i < nr_zones; i++, zno++) {
		/*
		 * Stacked DM target drivers will remap the zone information by
		 * modifying the zone information passed to the report callback.
		 * So use a local copy to avoid corruption of the device zone
		 * array.
		 */
		null_lock_zone(dev, zno);
		memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone));
		null_unlock_zone(dev, zno);

		error = cb(&zone, i, data);
		if (error)
			return error;
	}

	return nr_zones;
}

/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
				sector_t sector, unsigned int len)
{
	struct nullb_device *dev = nullb->dev;
	struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)];
	unsigned int nr_sectors = len >> SECTOR_SHIFT;

	/* Read must be below the write pointer position */
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
	    sector + nr_sectors <= zone->wp)
		return len;

	if (sector > zone->wp)
		return 0;

	/* Trim the read to the valid range below the write pointer. */
	return (zone->wp - sector) << SECTOR_SHIFT;
}

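/*
 * Zone condition state machine helpers. These are all called with
 * dev->zone_dev_lock held.
 */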
static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* close operation on closed is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	if (zone->wp == zone->start) {
		zone->cond = BLK_ZONE_COND_EMPTY;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		dev->nr_zones_closed++;
	}

	return BLK_STS_OK;
}

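/* Close the first implicitly open sequential zone to free up an open slot. */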
static void null_close_first_imp_zone(struct nullb_device *dev)
{
	unsigned int i;

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
			null_close_zone(dev, &dev->zones[i]);
			return;
		}
	}
}

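/*
 * Check if one more zone may become active (implicitly open, explicitly
 * open or closed) without exceeding the max active zone limit.
 */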
static blk_status_t null_check_active(struct nullb_device *dev)
{
	if (!dev->zone_max_active)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
			dev->nr_zones_closed < dev->zone_max_active)
		return BLK_STS_OK;

	return BLK_STS_ZONE_ACTIVE_RESOURCE;
}

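/*
 * Check if one more zone may be opened without exceeding the max open
 * zone limit, closing an implicitly open zone if that can free a slot.
 */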
static blk_status_t null_check_open(struct nullb_device *dev)
{
	if (!dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_imp_open) {
		if (null_check_active(dev) == BLK_STS_OK) {
			null_close_first_imp_zone(dev);
			return BLK_STS_OK;
		}
	}

	return BLK_STS_ZONE_OPEN_RESOURCE;
}

/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_active(dev);
		if (ret != BLK_STS_OK)
			return ret;
		fallthrough;
	case BLK_ZONE_COND_CLOSED:
		return null_check_open(dev);
	default:
		/* Should never be called for other states */
		WARN_ON(1);
		return BLK_STS_IOERR;
	}
}

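/*
 * Handle a write or zone append command. For a sequential zone, the
 * write must land exactly on the write pointer (appends are redirected
 * to it), must fit within the zone capacity, and moves the write
 * pointer forward on success.
 */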
static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				    unsigned int nr_sectors, bool append)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	struct blk_zone *zone = &dev->zones[zno];
	blk_status_t ret;

	trace_nullb_zone_op(cmd, zno, zone->cond);

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);

	null_lock_zone(dev, zno);
	spin_lock(&dev->zone_dev_lock);

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* Cannot write to a full zone */
		ret = BLK_STS_IOERR;
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		break;
	default:
		/* Invalid zone condition */
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	/*
	 * Regular writes must be at the write pointer position.
	 * Zone append writes are automatically issued at the write
	 * pointer and the position returned using the request or BIO
	 * sector.
	 */
	if (append) {
		sector = zone->wp;
		if (cmd->bio)
			cmd->bio->bi_iter.bi_sector = sector;
		else
			cmd->rq->__sector = sector;
	} else if (sector != zone->wp) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	/* The write must not cross the zone capacity. */
	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->cond == BLK_ZONE_COND_CLOSED) {
		dev->nr_zones_closed--;
		dev->nr_zones_imp_open++;
	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
		dev->nr_zones_imp_open++;
	}
	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	/* Drop the spinlock across the actual data processing. */
	spin_unlock(&dev->zone_dev_lock);
	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	spin_lock(&dev->zone_dev_lock);
	if (ret != BLK_STS_OK)
		goto unlock;

	zone->wp += nr_sectors;
	if (zone->wp == zone->start + zone->capacity) {
		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
			dev->nr_zones_exp_open--;
		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
			dev->nr_zones_imp_open--;
		zone->cond = BLK_ZONE_COND_FULL;
	}
	ret = BLK_STS_OK;

unlock:
	spin_unlock(&dev->zone_dev_lock);
	null_unlock_zone(dev, zno);

	return ret;
}

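/* Explicitly open a zone (REQ_OP_ZONE_OPEN). */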
static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* open operation on exp open is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EXP_OPEN;
	dev->nr_zones_exp_open++;

	return BLK_STS_OK;
}

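/* Finish a zone: transition it to full, with the write pointer at the zone end. */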
static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* finish operation on full is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = zone->start + zone->len;

	return BLK_STS_OK;
}

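/* Reset a zone: rewind the write pointer back to the zone start. */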
static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		/* reset operation on empty is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;

	return BLK_STS_OK;
}

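/*
 * Dispatch a zone management operation, taking the per-zone lock and
 * dev->zone_dev_lock around each zone condition change.
 */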
static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
				   sector_t sector)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zone_no;
	struct blk_zone *zone;
	blk_status_t ret;
	size_t i;

	if (op == REQ_OP_ZONE_RESET_ALL) {
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
			null_lock_zone(dev, i);
			zone = &dev->zones[i];
			if (zone->cond != BLK_ZONE_COND_EMPTY) {
				spin_lock(&dev->zone_dev_lock);
				null_reset_zone(dev, zone);
				spin_unlock(&dev->zone_dev_lock);
				trace_nullb_zone_op(cmd, i, zone->cond);
			}
			null_unlock_zone(dev, i);
		}
		return BLK_STS_OK;
	}

	zone_no = null_zone_no(dev, sector);
	zone = &dev->zones[zone_no];

	null_lock_zone(dev, zone_no);
	spin_lock(&dev->zone_dev_lock);

	switch (op) {
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		ret = BLK_STS_NOTSUPP;
		break;
	}

	spin_unlock(&dev->zone_dev_lock);

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	null_unlock_zone(dev, zone_no);

	return ret;
}

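/*
 * Main entry point for commands to a zoned nullb device: writes and
 * zone appends go through null_zone_write(), zone management operations
 * through null_zone_mgmt(), and anything else (e.g. reads) is processed
 * directly with the target zone locked.
 */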
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
				    sector_t sector, sector_t nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	blk_status_t sts;

	switch (op) {
	case REQ_OP_WRITE:
		sts = null_zone_write(cmd, sector, nr_sectors, false);
		break;
	case REQ_OP_ZONE_APPEND:
		sts = null_zone_write(cmd, sector, nr_sectors, true);
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		sts = null_zone_mgmt(cmd, op, sector);
		break;
	default:
		null_lock_zone(dev, zno);
		sts = null_process_cmd(cmd, op, sector, nr_sectors);
		null_unlock_zone(dev, zno);
	}

	return sts;
}