#include <asm/atomic.h>
#include "raid6.h"
+#include <linux/raid/bitmap.h>
+
/*
* Stripe cache
*/
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state))
list_add_tail(&sh->lru, &conf->delayed_list);
- else
+ else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ conf->seq_write == sh->bm_seq)
+ list_add_tail(&sh->lru, &conf->bitmap_list);
+ else {
+ clear_bit(STRIPE_BIT_DELAY, &sh->state);
list_add_tail(&sh->lru, &conf->handle_list);
+ }
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
spin_lock_irq(&conf->device_lock);
do {
+ wait_event_lock_irq(conf->wait_for_stripe,
+ conf->quiesce == 0,
+ conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector);
if (!sh) {
if (!conf->inactive_blocked)
raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
PRINTK("raid6: error called\n");
- if (!rdev->faulty) {
+ if (!test_bit(Faulty, &rdev->flags)) {
mddev->sb_dirty = 1;
- if (rdev->in_sync) {
+ if (test_bit(In_sync, &rdev->flags)) {
conf->working_disks--;
mddev->degraded++;
conf->failed_disks++;
- rdev->in_sync = 0;
+ clear_bit(In_sync, &rdev->flags);
/*
* if recovery was running, make sure it aborts.
*/
set_bit(MD_RECOVERY_ERR, &mddev->recovery);
}
- rdev->faulty = 1;
+ set_bit(Faulty, &rdev->flags);
printk (KERN_ALERT
"raid6: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
{
struct bio **bip;
raid6_conf_t *conf = sh->raid_conf;
+ int firstwrite=0;
PRINTK("adding bh b#%llu to stripe s#%llu\n",
(unsigned long long)bi->bi_sector,
spin_lock(&sh->lock);
spin_lock_irq(&conf->device_lock);
- if (forwrite)
+ if (forwrite) {
bip = &sh->dev[dd_idx].towrite;
- else
+ if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+ firstwrite = 1;
+ } else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
(unsigned long long)bi->bi_sector,
(unsigned long long)sh->sector, dd_idx);
+ if (conf->mddev->bitmap && firstwrite) {
+ sh->bm_seq = conf->seq_write;
+ bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0);
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+
if (forwrite) {
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
}
if (dev->written) written++;
rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */
- if (!rdev || !rdev->in_sync) {
+ if (!rdev || !test_bit(In_sync, &rdev->flags)) {
if ( failed < 2 )
failed_num[failed] = i;
failed++;
* need to be failed
*/
if (failed > 2 && to_read+to_write+written) {
- spin_lock_irq(&conf->device_lock);
for (i=disks; i--; ) {
+ int bitmap_end = 0;
+ spin_lock_irq(&conf->device_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
- if (bi) to_write--;
+ if (bi) { to_write--; bitmap_end = 1; }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (bi) bitmap_end = 1;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
bi = nextbi;
}
}
+ spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0, 0);
}
- spin_unlock_irq(&conf->device_lock);
}
if (failed > 2 && syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
if (!test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags) ) {
/* We can return any write requests */
+ int bitmap_end = 0;
struct bio *wbi, *wbi2;
PRINTK("Return write for stripe %llu disc %d\n",
(unsigned long long)sh->sector, i);
}
wbi = wbi2;
}
+ if (dev->towrite == NULL)
+ bitmap_end = 1;
spin_unlock_irq(&conf->device_lock);
+ if (bitmap_end)
+ bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS,
+ !test_bit(STRIPE_DEGRADED, &sh->state), 0);
}
}
}
}
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
- if (locked == 0 && rcw == 0) {
+ if (locked == 0 && rcw == 0 &&
+ !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
if ( must_compute > 0 ) {
/* We have failed blocks and need to compute them */
switch ( failed ) {
bdev = &sh->dev[failed_num[1]];
locked += !test_bit(R5_LOCKED, &bdev->flags);
set_bit(R5_LOCKED, &bdev->flags);
+ clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(R5_Wantwrite, &bdev->flags);
set_bit(STRIPE_INSYNC, &sh->state);
bi->bi_end_io = raid6_end_read_request;
rcu_read_lock();
- rdev = conf->disks[i].rdev;
- if (rdev && rdev->faulty)
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
bi->bi_next = NULL;
generic_make_request(bi);
} else {
+ if (rw == 1)
+ set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
}
}
+static inline void activate_bit_delay(raid6_conf_t *conf)
+{
+ /* device_lock is held */
+ struct list_head head;
+ list_add(&head, &conf->bitmap_list);
+ list_del_init(&conf->bitmap_list);
+ while (!list_empty(&head)) {
+ struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ list_del_init(&sh->lru);
+ atomic_inc(&sh->count);
+ __release_stripe(conf, sh);
+ }
+}
+
static void unplug_slaves(mddev_t *mddev)
{
raid6_conf_t *conf = mddev_to_conf(mddev);
rcu_read_lock();
for (i=0; i<mddev->raid_disks; i++) {
- mdk_rdev_t *rdev = conf->disks[i].rdev;
- if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
atomic_inc(&rdev->nr_pending);
spin_lock_irqsave(&conf->device_lock, flags);
- if (blk_remove_plug(q))
+ if (blk_remove_plug(q)) {
+ conf->seq_flush++;
raid6_activate_delayed(conf);
+ }
md_wakeup_thread(mddev->thread);
spin_unlock_irqrestore(&conf->device_lock, flags);
rcu_read_lock();
for (i=0; i<mddev->raid_disks && ret == 0; i++) {
- mdk_rdev_t *rdev = conf->disks[i].rdev;
- if (rdev && !rdev->faulty) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct block_device *bdev = rdev->bdev;
request_queue_t *r_queue = bdev_get_queue(bdev);
sector_t new_sector;
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ const int rw = bio_data_dir(bi);
if (unlikely(bio_barrier(bi))) {
bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
md_write_start(mddev, bi);
- if (bio_data_dir(bi)==WRITE) {
- disk_stat_inc(mddev->gendisk, writes);
- disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
- } else {
- disk_stat_inc(mddev->gendisk, reads);
- disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
- }
+ disk_stat_inc(mddev->gendisk, ios[rw]);
+ disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
last_sector = bi->bi_sector + (bi->bi_size>>9);
if (--bi->bi_phys_segments == 0) {
int bytes = bi->bi_size;
- if ( bio_data_dir(bi) == WRITE )
+ if (rw == WRITE )
md_write_end(mddev);
bi->bi_size = 0;
bi->bi_end_io(bi, bytes, 0);
sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks - 2;
+ sector_t max_sector = mddev->size << 1;
+ int sync_blocks;
+ int still_degraded = 0;
+ int i;
- if (sector_nr >= mddev->size <<1) {
+ if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
unplug_slaves(mddev);
+
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+ bitmap_close_sync(mddev->bitmap);
+
return 0;
}
/* if there are 2 or more failed drives and we are trying
*skipped = 1;
return rv;
}
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+ /* we can skip this block, and probably more */
+ sync_blocks /= STRIPE_SECTORS;
+ *skipped = 1;
+ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
+ }
x = sector_nr;
chunk_offset = sector_div(x, sectors_per_chunk);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
}
+ /* Need to check if array will still be degraded after recovery/resync
+ * We don't need to check the 'failed' flag as when that gets set,
+ * recovery aborts.
+ */
+ for (i=0; i<mddev->raid_disks; i++)
+ if (conf->disks[i].rdev == NULL)
+ still_degraded = 1;
+
+ bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+
spin_lock(&sh->lock);
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
while (1) {
struct list_head *first;
+ if (conf->seq_flush - conf->seq_write > 0) {
+ int seq = conf->seq_flush;
+ spin_unlock_irq(&conf->device_lock);
+ bitmap_unplug(mddev->bitmap);
+ spin_lock_irq(&conf->device_lock);
+ conf->seq_write = seq;
+ activate_bit_delay(conf);
+ }
+
if (list_empty(&conf->handle_list) &&
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
!blk_queue_plugged(mddev->queue) &&
PRINTK("--- raid6d inactive\n");
}
-static int run (mddev_t *mddev)
+static int run(mddev_t *mddev)
{
raid6_conf_t *conf;
int raid_disk, memory;
init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->bitmap_list);
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
disk->rdev = rdev;
- if (rdev->in_sync) {
+ if (test_bit(In_sync, &rdev->flags)) {
char b[BDEVNAME_SIZE];
printk(KERN_INFO "raid6: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b),
/* Ok, everything is just fine now */
mddev->array_size = mddev->size * (mddev->raid_disks - 2);
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+
mddev->queue->unplug_fn = raid6_unplug_device;
mddev->queue->issue_flush_fn = raid6_issue_flush;
return 0;
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->disks[i].rdev &&
- conf->disks[i].rdev->in_sync ? "U" : "_");
+ test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
seq_printf (seq, "]");
#if RAID6_DUMPSTATE
seq_printf (seq, "\n");
tmp = conf->disks + i;
if (tmp->rdev)
printk(" disk %d, o:%d, dev:%s\n",
- i, !tmp->rdev->faulty,
+ i, !test_bit(Faulty, &tmp->rdev->flags),
bdevname(tmp->rdev->bdev,b));
}
}
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
if (tmp->rdev
- && !tmp->rdev->faulty
- && !tmp->rdev->in_sync) {
+ && !test_bit(Faulty, &tmp->rdev->flags)
+ && !test_bit(In_sync, &tmp->rdev->flags)) {
mddev->degraded--;
conf->failed_disks--;
conf->working_disks++;
- tmp->rdev->in_sync = 1;
+ set_bit(In_sync, &tmp->rdev->flags);
}
}
print_raid6_conf(conf);
print_raid6_conf(conf);
rdev = p->rdev;
if (rdev) {
- if (rdev->in_sync ||
+ if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
goto abort;
/* no point adding a device */
return 0;
/*
- * find the disk ...
+ * find the disk ... but prefer rdev->saved_raid_disk
+ * if possible.
*/
- for (disk=0; disk < mddev->raid_disks; disk++)
+ if (rdev->saved_raid_disk >= 0 &&
+ conf->disks[rdev->saved_raid_disk].rdev == NULL)
+ disk = rdev->saved_raid_disk;
+ else
+ disk = 0;
+ for ( ; disk < mddev->raid_disks; disk++)
if ((p=conf->disks + disk)->rdev == NULL) {
- rdev->in_sync = 0;
+ clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
found = 1;
- p->rdev = rdev;
+ if (rdev->saved_raid_disk != disk)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
break;
}
print_raid6_conf(conf);
return 0;
}
+static void raid6_quiesce(mddev_t *mddev, int state)
+{
+ raid6_conf_t *conf = mddev_to_conf(mddev);
+
+ switch(state) {
+ case 1: /* stop all writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ atomic_read(&conf->active_stripes) == 0,
+ conf->device_lock, /* nothing */);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+
+ case 0: /* re-enable writes */
+ spin_lock_irq(&conf->device_lock);
+ conf->quiesce = 0;
+ wake_up(&conf->wait_for_stripe);
+ spin_unlock_irq(&conf->device_lock);
+ break;
+ }
+ if (mddev->thread) {
+ if (mddev->bitmap)
+ mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
+ else
+ mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ md_wakeup_thread(mddev->thread);
+ }
+}
static mdk_personality_t raid6_personality=
{
.name = "raid6",
.spare_active = raid6_spare_active,
.sync_request = sync_request,
.resize = raid6_resize,
+ .quiesce = raid6_quiesce,
};
static int __init raid6_init (void)