diff --git a/drivers/md/md.c b/drivers/md/md.c index 541151bcfe81..0fde115e921f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8625,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev) put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) { + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; + + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; + + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - idle = 1; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; - - if (!init && !blk_queue_io_stat(disk->queue)) - continue; - - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } diff --git a/drivers/md/md.h b/drivers/md/md.h index b57842188f18..1982f1f18627 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -132,7 +132,7 @@ struct md_rdev { sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ + unsigned long last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is @@ -520,6 +520,7 @@ struct mddev { * adding a spare */ + unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp;