// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/
/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *   to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for system
 * or /sys/block/mdX/md/ for one array.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static int sysctl_sync_io_depth = 32;
staticint rdevs_init_serial(struct mddev *mddev)
{ struct md_rdev *rdev; int ret = 0;
rdev_for_each(rdev, mddev) {
ret = rdev_init_serial(rdev); if (ret) break;
}
/* Free all resources if pool is not existed */ if (ret && !mddev->serial_info_pool)
rdevs_uninit_serial(mddev);
return ret;
}
/*
 * Decide whether @rdev has to use the serialization machinery.
 *
 * All of the following must hold:
 *  - @rdev is not NULL;
 *  - write-behind is enabled on its array (max_write_behind > 0);
 *  - the underlying queue does not have exactly one hardware queue;
 *  - the rdev is flagged WriteMostly.
 *
 * Returns 1 when serialization is needed, 0 otherwise.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	if (!rdev)
		return 0;

	if (rdev->mddev->bitmap_info.max_write_behind <= 0)
		return 0;

	if (rdev->bdev->bd_disk->queue->nr_hw_queues == 1)
		return 0;

	return !!test_bit(WriteMostly, &rdev->flags);
}
/*
 * Set up per-rdev serial resources and, if @mddev does not have one yet,
 * the serial_info_pool mempool.
 *
 * @rdev != NULL: only that device is initialised, and only when it either
 *                passes rdev_need_serial() or carries CollisionCheck.
 * @rdev == NULL: serialization is enabled for every rdev of @mddev.
 *
 * Nothing is reported to the caller: an initialisation failure returns
 * silently, and a mempool allocation failure undoes the rdev setup via
 * rdevs_uninit_serial() and logs an error.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;

	if (rdev) {
		if (!rdev_need_serial(rdev) &&
		    !test_bit(CollisionCheck, &rdev->flags))
			return;
		err = rdev_init_serial(rdev);
	} else {
		err = rdevs_init_serial(mddev);
	}
	if (err)
		return;

	/* The pool is shared by the whole array; create it only once. */
	if (mddev->serial_info_pool)
		return;

	/* already in memalloc noio context by mddev_suspend() */
	mddev->serial_info_pool =
		mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
					    sizeof(struct serial_info));
	if (mddev->serial_info_pool)
		return;

	rdevs_uninit_serial(mddev);
	pr_err("can't alloc memory pool for serialization\n");
}
/* * Free resource from rdev(s), and destroy serial_info_pool under conditions: * 1. rdev is the last device flaged with CollisionCheck. * 2. when bitmap is destroyed while policy is not enabled. * 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{ if (rdev && !test_bit(CollisionCheck, &rdev->flags)) return;
if (mddev->serial_info_pool) { struct md_rdev *temp; int num = 0; /* used to track if other rdevs need the pool */
if (num)
pr_info("The mempool could be used by other devices\n"); else {
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
}
}
/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
static bool legacy_async_del_gendisk = true;
/*
 * A single system-wide 'event count' is bumped on every 'interesting'
 * event. Readers of /proc/mdstat can 'poll' or 'select' to learn when the
 * count has increased.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/* Record one more event and wake everybody sleeping on md_event_waiters. */
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
/*
 * List that allows iterating over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);
staticbool is_md_suspended(struct mddev *mddev)
{ return percpu_ref_is_dying(&mddev->active_io);
} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. * We hold a refcount over the call to ->make_request. By the time that * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more.
*/ staticbool is_suspended(struct mddev *mddev, struct bio *bio)
{ if (is_md_suspended(mddev)) returntrue; if (bio_data_dir(bio) != WRITE) returnfalse; if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) returnfalse; if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) returnfalse; if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) returnfalse; returntrue;
}
bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended: if (is_suspended(mddev, bio)) {
DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio); returntrue;
} for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break;
schedule();
}
finish_wait(&mddev->sb_wait, &__wait);
} if (!percpu_ref_tryget_live(&mddev->active_io)) goto check_suspended;
if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io); if (!mddev->gendisk && mddev->pers->prepare_suspend) returnfalse; goto check_suspended;
}
if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); return;
}
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
md_handle_request(mddev, bio);
}
/* * Make sure no new requests are submitted to the device, and any requests that * have been submitted are completely handled.
*/ int mddev_suspend(struct mddev *mddev, bool interruptible)
{ int err = 0;
/* * hold reconfig_mutex to wait for normal io will deadlock, because * other context can't update super_block, and normal io can rely on * updating super_block.
*/
lockdep_assert_not_held(&mddev->reconfig_mutex);
if (interruptible)
err = mutex_lock_interruptible(&mddev->suspend_mutex); else
mutex_lock(&mddev->suspend_mutex); if (err) return err;
percpu_ref_kill(&mddev->active_io); if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io)); else
wait_event(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io)); if (err) {
percpu_ref_resurrect(&mddev->active_io);
mutex_unlock(&mddev->suspend_mutex); return err;
}
/* * For raid456, io might be waiting for reshape to make progress, * allow new reshape to start while waiting for io to be done to * prevent deadlock.
*/
WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
if (recovery_needed)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
/* sync bdev before setting device to readonly or stopping raid*/ staticint mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
mutex_lock(&mddev->open_mutex); if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
mutex_unlock(&mddev->open_mutex); return -EBUSY;
} if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex); return -EBUSY;
}
mutex_unlock(&mddev->open_mutex);
sync_blockdev(mddev->gendisk->part0); return 0;
}
/*
 * Completion handler for a flush bio whose bi_private points at the bio it
 * was chained to.
 *
 * Unlike bio_chain_endio(), the status of the completed flush is only
 * logged here; it is not copied into the parent's bi_status.
 */
static void md_end_flush(struct bio *bio)
{
	struct bio *chained_to = bio->bi_private;

	/*
	 * A flush error that happens before a power failure means data on
	 * the disk may be lost, so make it visible in the log.
	 */
	if (bio->bi_status) {
		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
		       blk_status_to_errno(bio->bi_status));
	}

	bio_put(bio);
	bio_endio(chained_to);
}
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{ struct md_rdev *rdev; struct bio *new;
	/*
	 * md_flush_request() should be called under md_handle_request() and
	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
	 * without rcu protection.
*/
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
/* * If array is freed by stopping array, MD_DELETED is set by * do_md_stop(), MD_DELETED is still set here in case mddev is freed * directly by closing a mddev that is created by create_on_open.
*/
set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done.
*/
queue_work(md_misc_wq, &mddev->del_work);
}
/*
 * Drop one reference on mddev->active and hand the array to __mddev_put()
 * when that was the last one.
 *
 * NOTE(review): presumably the caller already holds all_mddevs_lock, as the
 * name suggests - confirm against the callers.
 */
static void mddev_put_locked(struct mddev *mddev)
{
	if (!atomic_dec_and_test(&mddev->active))
		return;

	__mddev_put(mddev);
}
void mddev_put(struct mddev *mddev)
{ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return;
if (!list_empty(&mddev->deleting))
list_splice_init(&mddev->deleting, &delete);
if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. * So hold set sysfs_active while the remove in happeing, * and anything else which might set ->to_remove or my * otherwise change the sysfs namespace will fail with * -EBUSY if sysfs_active is still set. * We set sysfs_active under reconfig_mutex and elsewhere * test it under the same mutex to ensure its correct value * is seen.
*/ conststruct attribute_group *to_remove = mddev->to_remove;
mddev->to_remove = NULL;
mddev->sysfs_active = 1;
mutex_unlock(&mddev->reconfig_mutex);
if (mddev->kobj.sd) { if (to_remove != &md_redundancy_group)
sysfs_remove_group(&mddev->kobj, to_remove); if (mddev->pers == NULL ||
mddev->pers->sync_request == NULL) {
sysfs_remove_group(&mddev->kobj, &md_redundancy_group); if (mddev->sysfs_action)
sysfs_put(mddev->sysfs_action); if (mddev->sysfs_completed)
sysfs_put(mddev->sysfs_completed); if (mddev->sysfs_degraded)
sysfs_put(mddev->sysfs_degraded);
mddev->sysfs_action = NULL;
mddev->sysfs_completed = NULL;
mddev->sysfs_degraded = NULL;
}
}
mddev->sysfs_active = 0;
} else
mutex_unlock(&mddev->reconfig_mutex);
if (!legacy_async_del_gendisk) { /* * Call del_gendisk after release reconfig_mutex to avoid * deadlock (e.g. call del_gendisk under the lock and an * access to sysfs files waits the lock) * And MD_DELETED is only used for md raid which is set in * do_md_stop. dm raid only uses md_stop to stop. So dm raid * doesn't need to check MD_DELETED when getting reconfig lock
*/ if (test_bit(MD_DELETED, &mddev->flags))
del_gendisk(mddev->gendisk);
}
}
EXPORT_SYMBOL_GPL(mddev_unlock);
xa_lock(&md_submodule);
xa_for_each(&md_submodule, i, head) { if (head->type != MD_PERSONALITY) continue; if ((level != LEVEL_NONE && head->id == level) ||
!strcmp(head->name, clevel)) { if (try_module_get(head->owner))
ret = (void *)head; break;
}
}
xa_unlock(&md_submodule);
if (!ret) { if (level != LEVEL_NONE)
pr_warn("md: personality for level %d is not loaded!\n",
level); else
pr_warn("md: personality for level %s is not loaded!\n",
clevel);
}
/* return the offset of the super block in 512byte sectors */ staticinline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{ return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
}
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
sector_t sector, int size, struct page *page)
{ /* write first size bytes of page to sector of rdev * Increment mddev->pending_writes before returning * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error
*/ struct bio *bio;
/*
 * Sleep on sb_wait until pending_writes has dropped to zero, i.e. until all
 * scheduled superblock writes have completed.
 *
 * Returns -EAGAIN if MD_SB_NEED_REWRITE was set (the bit is cleared as a
 * side effect), 0 otherwise.
 */
int md_super_wait(struct mddev *mddev)
{
	wait_event(mddev->sb_wait, !atomic_read(&mddev->pending_writes));

	return test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags) ?
		-EAGAIN : 0;
}
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op)
{ struct bio bio; struct bio_vec bvec;
for (i = 0; i < MD_SB_BYTES/4 ; i++)
newcsum += sb32[i];
csum = (newcsum & 0xffffffff) + (newcsum>>32);
#ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on * different architectures. It isn't critical that we get exactly * the same return value as before (we always csum_fold before * testing, and that removes any differences). However as we * know that csum_partial always returned a 16bit value on * alphas, do a fold to maximise conformity to previous behaviour.
*/
sb->sb_csum = md_csum_fold(disk_csum); #else
sb->sb_csum = disk_csum; #endif return csum;
}
/* * Handle superblock details. * We want to be able to handle multiple superblock formats * so we have a common interface to them all, and an array of * different handlers. * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: * 0 - dev has a superblock that is compatible with refdev * 1 - dev has a superblock that is compatible and newer than refdev * so dev should be used as the refdev in future * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. *
*/
/*
 * Verify that @mddev is not configured with a bitmap.
 *
 * Meant for the run method of personalities that do not support bitmaps.
 * A bitmap is considered present when either bitmap_info.file or
 * bitmap_info.offset is set; in that case a warning naming the array and
 * the personality is printed.
 *
 * Returns 1 if a bitmap is configured, 0 if not.
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("%s: bitmaps are not supported for %s\n",
			mdname(mddev), mddev->pers->head.name);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL(md_check_no_bitmap);
/* * load_super for 0.90.0
*/ staticint super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
mdp_super_t *sb; int ret; bool spare_disk = true;
/* * Calculate the position of the superblock (512byte sectors), * it's at the end of the disk. * * It also happens to be a multiple of 4Kb.
*/
rdev->sb_start = calc_dev_sboffset(rdev);
ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret;
ret = -EINVAL;
sb = page_address(rdev->sb_page);
if (sb->md_magic != MD_SB_MAGIC) {
pr_warn("md: invalid raid superblock magic on %pg\n",
rdev->bdev); goto abort;
}
if (sb->major_version != 0 ||
sb->minor_version < 90 ||
sb->minor_version > 91) {
pr_warn("Bad version number %d.%d on %pg\n",
sb->major_version, sb->minor_version, rdev->bdev); goto abort;
}
if (sb->raid_disks <= 0) goto abort;
if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); goto abort;
}
/* not spare disk */ if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
spare_disk = false;
if (!refdev) { if (!spare_disk)
ret = 1; else
ret = 0;
} else {
__u64 ev1, ev2;
mdp_super_t *refsb = page_address(refdev->sb_page); if (!md_uuid_equal(refsb, sb)) {
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev, refdev->bdev); goto abort;
} if (!md_sb_equal(refsb, sb)) {
pr_warn("md: %pg has same UUID but different superblock to %pg\n",
rdev->bdev, refdev->bdev); goto abort;
}
ev1 = md_event(sb);
ev2 = md_event(refsb);
if (!spare_disk && ev1 > ev2)
ret = 1; else
ret = 0;
}
rdev->sectors = rdev->sb_start; /* Limit to 4TB as metadata cannot record more than that. * (not needed for Linear and RAID0 as metadata doesn't * record this size)
*/ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... */
ret = -EINVAL;
} elseif (mddev->pers == NULL) { /* Insist on good event counter while assembling, except
* for spares (which don't need an event count) */
++ev1; if (sb->disks[rdev->desc_nr].state & (
(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) if (ev1 < mddev->events) return -EINVAL;
} elseif (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old.
*/ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
} else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0;
}
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
set_bit(Faulty, &rdev->flags); elseif (desc->state & (1<<MD_DISK_SYNC)) {
set_bit(In_sync, &rdev->flags);
rdev->raid_disk = desc->raid_disk;
rdev->saved_raid_disk = desc->raid_disk;
} elseif (desc->state & (1<<MD_DISK_ACTIVE)) { /* active but not in sync implies recovery up to * reshape position. We don't know exactly where * that is, so set to zero for now
*/ if (mddev->minor_version >= 91) {
rdev->recovery_offset = 0;
rdev->raid_disk = desc->raid_disk;
}
} if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags); if (desc->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags); return 0;
}
/* make rdev->sb match mddev data.. * * 1/ zero out disks * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); * 3/ any empty disks < next_spare become removed * * disks[0] gets initialised to REMOVED because * we cannot be sure from other fields if it has * been initialised or not.
*/ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0;
staticint super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{ struct mdp_superblock_1 *sb; int ret;
sector_t sb_start;
sector_t sectors; int bmask; bool spare_disk = true;
	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
*/ switch(minor_version) { case 0:
sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
sb_start &= ~(sector_t)(4*2-1); break; case 1:
sb_start = 0; break; case 2:
sb_start = 8; break; default: return -EINVAL;
}
rdev->sb_start = sb_start;
/* superblock is rarely larger than 1K, but it can be larger, * and it is safe to read 4k, so we do that
*/
ret = read_disk_sb(rdev, 4096); if (ret) return ret;
if (calc_sb_1_csum(sb) != sb->sb_csum) {
pr_warn("md: invalid superblock checksum on %pg\n",
rdev->bdev); return -EINVAL;
} if (le64_to_cpu(sb->data_size) < 10) {
pr_warn("md: data_size too small on %pg\n",
rdev->bdev); return -EINVAL;
} if (sb->pad0 ||
sb->pad3[0] ||
memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) /* Some padding is non-zero, might be a new feature */ return -EINVAL;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset =
(__s32)le32_to_cpu(sb->bitmap_offset); /* Metadata doesn't record how much space is available. * For 1.0, we assume we can use up to the superblock * if before, else to 4K beyond superblock. * For others, assume no change is possible.
*/ if (mddev->minor_version > 0)
mddev->bitmap_info.space = 0; elseif (mddev->bitmap_info.offset > 0)
mddev->bitmap_info.space =
8 - mddev->bitmap_info.offset; else
mddev->bitmap_info.space =
-mddev->bitmap_info.offset;
}
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
if (le32_to_cpu(sb->feature_map) &
(MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) &
(MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
(le32_to_cpu(sb->feature_map) &
MD_FEATURE_MULTIPLE_PPLS)) return -EINVAL;
set_bit(MD_HAS_PPL, &mddev->flags);
}
} elseif (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count). * Similar to mdadm, we allow event counter difference of 1 * from the freshest device.
*/ if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 + 1 < mddev->events) return -EINVAL;
} elseif (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old.
*/ if (ev1 < md_bitmap_events_cleared(mddev)) return 0; if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
} else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ return 0;
}
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} elseif (mddev->pers == NULL && freshest && ev1 < mddev->events) { /* * If we are assembling, and our event counter is smaller than the * highest event counter, we cannot trust our superblock about the role. * It could happen that our rdev was marked as Faulty, and all other * superblocks were updated with +1 event counter. * Then, before the next superblock update, which typically happens when * remove_and_add_spares() removes the device from the array, there was * a crash or reboot. * If we allow current rdev without consulting the freshest superblock, * we could cause data corruption. * Note that in this case our event counter is smaller by 1 than the * highest, otherwise, this rdev would not be allowed into array; * both kernel and mdadm allow event counter difference of 1.
*/ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
if (rdev->desc_nr >= freshest_max_dev) { /* this is unexpected, better not proceed */
pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
mdname(mddev), rdev->bdev, rdev->desc_nr,
freshest->bdev, freshest_max_dev); return -EUCLEAN;
}
role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
mdname(mddev), rdev->bdev, role, role, freshest->bdev);
} else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
} switch (role) { case MD_DISK_ROLE_SPARE: /* spare */ break; case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags); break; case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */
pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
rdev->raid_disk = 0; break; default:
rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_OFFSET)) {
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); if (!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RECOVERY_BITMAP))
rdev->saved_raid_disk = -1;
} else { /* * If the array is FROZEN, then the device can't * be in_sync with rest of array.
*/ if (!test_bit(MD_RECOVERY_FROZEN,
&mddev->recovery))
set_bit(In_sync, &rdev->flags);
}
rdev->raid_disk = role; break;
} if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags); if (sb->devflags & FailFast1)
set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
return 0;
}
staticvoid super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{ struct mdp_superblock_1 *sb; struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */
/* if the device is bigger than 8Gig, save 64k for bitmap * usage, if bigger than 200Gig, save 128k
*/ if (dev_size < 64*2)
bm_space = 0; elseif (dev_size - 64*2 >= 200*1024*1024*2)
bm_space = 128*2; elseif (dev_size - 4*2 > 8*1024*1024*2)
bm_space = 64*2; else
bm_space = 4*2; return bm_space;
}
staticunsignedlonglong
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{ struct mdp_superblock_1 *sb;
sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ if (rdev->data_offset != rdev->new_data_offset) return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */
max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
} elseif (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0;
} else { /* minor version 0; superblock after data */
sector_t sb_start, bm_space;
sector_t dev_size = bdev_nr_sectors(rdev->bdev);
/* 8K is for superblock */
sb_start = dev_size - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
bm_space = super_1_choose_bm_space(dev_size);
		/* Space that can be used to store data needs to decrease
		 * superblock bitmap space and bad block space(4K)
*/
max_sectors = sb_start - bm_space - 4*2;
staticint
super_1_allow_new_offset(struct md_rdev *rdev, unsignedlonglong new_offset)
{ /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */ if (rdev->mddev->minor_version == 0) return 1;
/* otherwise we must be sure not to step on * any metadata, so stay: * 36K beyond start of superblock * beyond end of badblocks * beyond write-intent bitmap
*/ if (rdev->sb_start + (32+4)*2 > new_offset) return 0;
if (!rdev->mddev->bitmap_info.file) { struct mddev *mddev = rdev->mddev; struct md_bitmap_stats stats; int err;
/*
 * Report the data integrity state of an mddev.
 *
 * Called when an array is started and after a disk has been kicked from
 * the array. As written here, the function only emits a debug message,
 * and only when the array has at least one disk, is not a dm array and
 * blk_get_integrity() yields a profile for its gendisk.
 *
 * Always returns 0.
 */
int md_integrity_register(struct mddev *mddev)
{
	if (!list_empty(&mddev->disks) && !mddev_is_dm(mddev) &&
	    blk_get_integrity(mddev->gendisk))
		pr_debug("md: data integrity enabled on %s\n", mdname(mddev));

	return 0;
}
EXPORT_SYMBOL(md_integrity_register);
/* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST;
if (rdev_read_only(rdev) && mddev->pers) return -EROFS;
/* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) &&
rdev->sectors &&
(mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care * about aligning sizes (e.g. linear)
*/ if (mddev->level > 0) return -ENOSPC;
} else
mddev->dev_sectors = rdev->sectors;
}
/* Verify rdev->desc_nr is unique. * If it is -1, assign a free number, else * check number is not in use
*/
rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; if (mddev->pers)
choice = mddev->raid_disks; while (md_find_rdev_nr_rcu(mddev, choice))
choice++;
rdev->desc_nr = choice;
} else { if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
rcu_read_unlock(); return -EBUSY;
}
}
rcu_read_unlock(); if (!test_bit(Journal, &rdev->flags) &&
mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
pr_warn("md: %s: array is limited to %d devices\n",
mdname(mddev), mddev->max_disks); return -EBUSY;
}
snprintf(b, sizeof(b), "%pg", rdev->bdev);
strreplace(b, '/', '!');
/* * kobject_del() will wait for all in progress writers to be done, where * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock().
*/
list_add(&rdev->same_set, &mddev->deleting);
}
staticbool set_in_sync(struct mddev *mddev)
{
lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) {
mddev->sync_checkers++;
spin_unlock(&mddev->lock);
percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
spin_lock(&mddev->lock); if (!mddev->in_sync &&
percpu_ref_is_zero(&mddev->writes_pending)) {
mddev->in_sync = 1; /* * Ensure ->in_sync is visible before we clear * ->sync_checkers.
*/
smp_mb();
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} if (--mddev->sync_checkers == 0)
percpu_ref_switch_to_percpu(&mddev->writes_pending);
} if (mddev->safemode == 1)
mddev->safemode = 0; return mddev->in_sync;
}
/*
 * Refresh the in-memory superblock image of every rdev in @mddev.
 *
 * An rdev is left alone (sb_loaded = 2) when its sb_events already equals
 * mddev->events, or when @nospares is set, the rdev holds no raid slot
 * (raid_disk < 0) and its sb_events is exactly one behind - meaning it is
 * not being marked dirty together with the rest of the array.
 *
 * Every other rdev goes through sync_super() and gets sb_loaded = 1.
 */
static void sync_sbs(struct mddev *mddev, int nospares)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		bool skip = rdev->sb_events == mddev->events ||
			    (nospares && rdev->raid_disk < 0 &&
			     rdev->sb_events + 1 == mddev->events);

		if (skip) {
			/* Don't update this superblock */
			rdev->sb_loaded = 2;
			continue;
		}

		sync_super(mddev, rdev);
		rdev->sb_loaded = 1;
	}
}
/* Find a good rdev */
rdev_for_each(iter, mddev) if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
rdev = iter; break;
}
/* No good device found. */ if (!rdev) returnfalse;
sb = page_address(rdev->sb_page); /* Check if a device has become faulty or a spare become active */
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); /* Device activated? */ if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) returntrue; /* Device turned faulty? */ if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) returntrue;
}
/* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
(mddev->layout != le32_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) returntrue;
returnfalse;
}
void md_update_sb(struct mddev *mddev, int force_change)
{ struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; int ret = -1;
if (!md_is_rdwr(mddev)) { if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return;
}
repeat: if (mddev_is_clustered(mddev)) { if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
nospares = 1;
ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0)
mddev->cluster_ops->metadata_update_cancel(mddev);
bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
BIT(MD_SB_CHANGE_DEVS) |
BIT(MD_SB_CHANGE_CLEAN)); return;
}
}
/* * First make sure individual recovery_offsets are correct * curr_resync_completed can only be used during recovery. * During reshape/resync it might use array-addresses rather * that device addresses.
*/
rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all
*/
nospares = 1; if (force_change)
nospares = 0; if (mddev->degraded) /* If the array is degraded, then skipping spares is both * dangerous and fairly pointless. * Dangerous because a device that was removed from the array * might have a event_count that still looks up-to-date, * so it can be re-added without a resync. * Pointless because if there are any spares to skip, * then a recovery will happen and soon that array won't * be degraded any more and the spare can go back to sleep then.
*/
nospares = 0;
sync_req = mddev->in_sync;
/* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */ if (nospares
&& (mddev->in_sync && mddev->resync_offset == MaxSector)
&& mddev->can_decrease_events
&& mddev->events != 1) {
mddev->events--;
mddev->can_decrease_events = 0;
} else { /* otherwise we have to go forward and ... */
mddev->events ++;
mddev->can_decrease_events = nospares;
}
/* * This 64-bit counter should never wrap. * Either we are in around ~1 trillion A.C., assuming * 1 reboot per second, or we have a bug...
*/
WARN_ON(mddev->events == 0);
rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed)
any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags))
set_bit(FaultRecorded, &rdev->flags);
}
} else
pr_debug("md: %pg (skipping faulty)\n",
rdev->bdev);
} if (md_super_wait(mddev) < 0) goto rewrite; /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
if (mddev_is_clustered(mddev) && ret == 0)
mddev->cluster_ops->metadata_update_finish(mddev);
if (mddev->in_sync != sync_req ||
!bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ goto repeat;
wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify_dirent_safe(mddev->sysfs_completed);
rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags))
clear_bit(Blocked, &rdev->flags);
if (any_badblocks_changed)
ack_all_badblocks(&rdev->badblocks);
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
}
EXPORT_SYMBOL(md_update_sb);
if (!mddev->pers->hot_remove_disk || add_journal) { /* If there is hot_add_disk but no hot_remove_disk * then added disks for geometry changes, * and should be added immediately.
*/
super_types[mddev->major_version].
validate_super(mddev, NULL/*freshest*/, rdev);
err = mddev->pers->hot_add_disk(mddev, rdev); if (err) {
md_kick_rdev_from_array(rdev); return err;
}
}
sysfs_notify_dirent_safe(rdev->sysfs_state);
/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case.  For this we use cmd_match.
 */
/*
 * cmd_match() - compare a sysfs-written word against a keyword.
 * @cmd: text as written by the user; may carry one trailing '\n'.
 * @str: keyword to compare against.
 *
 * Return: 1 if @cmd equals @str exactly, or equals @str followed by a
 * single newline; 0 otherwise (including when anything follows that
 * newline, or when @cmd is only a prefix of @str).
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Walk the common prefix of both strings. */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}

	/* Tolerate exactly one trailing newline on the written command. */
	if (*cmd == '\n')
		cmd++;

	/* Both strings must now be fully consumed for a match. */
	if (*str || *cmd)
		return 0;
	return 1;
}
if (test_bit(Faulty, &flags) ||
(!test_bit(ExternalBbl, &flags) &&
rdev->badblocks.unacked_exist))
len += sprintf(page+len, "faulty%s", sep); if (test_bit(In_sync, &flags))
len += sprintf(page+len, "in_sync%s", sep); if (test_bit(Journal, &flags))
len += sprintf(page+len, "journal%s", sep); if (test_bit(WriteMostly, &flags))
len += sprintf(page+len, "write_mostly%s", sep); if (test_bit(Blocked, &flags) ||
(rdev->badblocks.unacked_exist
&& !test_bit(Faulty, &flags)))
len += sprintf(page+len, "blocked%s", sep); if (!test_bit(Faulty, &flags) &&
!test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags))
len += sprintf(page+len, "spare%s", sep); if (test_bit(WriteErrorSeen, &flags))
len += sprintf(page+len, "write_error%s", sep); if (test_bit(WantReplacement, &flags))
len += sprintf(page+len, "want_replacement%s", sep); if (test_bit(Replacement, &flags))
len += sprintf(page+len, "replacement%s", sep); if (test_bit(ExternalBbl, &flags))
len += sprintf(page+len, "external_bbl%s", sep); if (test_bit(FailFast, &flags))
len += sprintf(page+len, "failfast%s", sep);
if (len)
len -= strlen(sep);
return len+sprintf(page+len, "\n");
}
static ssize_t
state_store(struct md_rdev *rdev, constchar *buf, size_t len)
{ /* can write * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly * blocked - sets the Blocked flags * -blocked - clears the Blocked and possibly simulates an error * insync - sets Insync providing device isn't active * -insync - clear Insync for a device with a slot assigned, * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen * {,-}failfast - set/clear FailFast
*/
err = 0;
} elseif (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
} elseif (cmd_match(buf, "failfast")) {
set_bit(FailFast, &rdev->flags);
need_update_sb = true;
err = 0;
} elseif (cmd_match(buf, "-failfast")) {
clear_bit(FailFast, &rdev->flags);
need_update_sb = true;
err = 0;
} elseif (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
rdev->raid_disk = -1;
err = 0;
}
} elseif (cmd_match(buf, "write_error")) {
set_bit(WriteErrorSeen, &rdev->flags);
err = 0;
} elseif (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags);
err = 0;
} elseif (cmd_match(buf, "want_replacement")) { /* Any non-spare device that is not a replacement can * become want_replacement at any time, but we then need to * check if recovery is needed.
*/ if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
err = 0;
} elseif (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. * Once replacements starts it is too late though.
*/
err = 0;
clear_bit(WantReplacement, &rdev->flags);
} elseif (cmd_match(buf, "replacement")) { /* Can only set a device as a replacement when array has not * yet been started. Once running, replacement is automatic * from spares, or by assigning 'slot'.
*/ if (rdev->mddev->pers)
err = -EBUSY; else {
set_bit(Replacement, &rdev->flags);
err = 0;
}
} elseif (cmd_match(buf, "-replacement")) { /* Similarly, can only clear Replacement before start */ if (rdev->mddev->pers)
err = -EBUSY; else {
clear_bit(Replacement, &rdev->flags);
err = 0;
}
} elseif (cmd_match(buf, "re-add")) { if (!rdev->mddev->pers)
err = -EINVAL; elseif (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they * will land in the local bitmap, which will be synced * by this node eventually
*/ if (!mddev_is_clustered(rdev->mddev) ||
(err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) {
clear_bit(Faulty, &rdev->flags);
err = add_bound_rdev(rdev);
}
} else
err = -EBUSY;
} elseif (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
set_bit(ExternalBbl, &rdev->flags);
rdev->badblocks.shift = 0;
err = 0;
} elseif (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
clear_bit(ExternalBbl, &rdev->flags);
err = 0;
} if (need_update_sb)
md_update_sb(mddev, 1); if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len;
} staticstruct rdev_sysfs_entry rdev_state =
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
static ssize_t
slot_store(struct md_rdev *rdev, constchar *buf, size_t len)
{ int slot; int err;
if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strncmp(buf, "none", 4)==0)
slot = -1; else {
err = kstrtouint(buf, 10, (unsignedint *)&slot); if (err < 0) return err; if (slot < 0) /* overflow */ return -ENOSPC;
} if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating * with the personality with ->hot_*_disk. * For now we only support removing * failed/spare devices. This normally happens automatically, * but not when the metadata is externally managed.
*/ if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(rdev->mddev, rdev); if (rdev->raid_disk >= 0) return -EBUSY;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
} elseif (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here.
*/ int err;
if (rdev->raid_disk != -1) return -EBUSY;
if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) return -EBUSY;
if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL;
if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */
; elseif (new_offset > rdev->data_offset) { /* must not push array size beyond rdev_sectors */ if (new_offset - rdev->data_offset
+ mddev->dev_sectors > rdev->sectors) return -E2BIG;
} /* Metadata worries about other space details. */
/* decreasing the offset is inconsistent with a backwards * reshape.
*/ if (new_offset < rdev->data_offset &&
mddev->reshape_backwards) return -EINVAL; /* Increasing offset is inconsistent with forwards * reshape. reshape_direction should be set to * 'backwards' first.
*/ if (new_offset > rdev->data_offset &&
!mddev->reshape_backwards) return -EINVAL;
if (test_bit(Journal, &rdev->flags)) return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) {
sectors = super_types[my_mddev->major_version].
rdev_size_change(rdev, sectors); if (!sectors) return -EBUSY;
} elseif (!sectors)
sectors = bdev_nr_sectors(rdev->bdev) -
rdev->data_offset; if (!my_mddev->pers->resize) /* Cannot change size for RAID0 or Linear etc */ return -EINVAL;
} if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */
rdev->sectors = sectors;
/* * Check that all other rdevs with the same bdev do not overlap. This * check does not provide a hard guarantee, it just helps avoid * dangerous mistakes.
*/ if (sectors > oldsectors && my_mddev->external &&
md_rdev_overlaps(rdev)) { /* * Someone else could have slipped in a size change here, but * doing so is just silly. We put oldsectors back because we * know it is safe, and trust userspace not to race with itself.
*/
rdev->sectors = oldsectors; return -EBUSY;
} return len;
}
/* sysfs access to bad-blocks list. * We present two files. * 'bad-blocks' lists sector numbers and lengths of ranges that * are recorded as bad. The list is truncated to fit within * the one-page limit of sysfs. * Writing "sector length" to this file adds an acknowledged * bad block list. * 'unacknowledged-bad-blocks' lists bad blocks that have not yet * been acknowledged. Writing to this file adds bad blocks * without acknowledging them. This is largely for testing.
*/ static ssize_t bb_show(struct md_rdev *rdev, char *page)
{ return badblocks_show(&rdev->badblocks, page, 0);
} static ssize_t bb_store(struct md_rdev *rdev, constchar *page, size_t len)
{ int rv = badblocks_store(&rdev->badblocks, page, len, 0); /* Maybe that ack was all we needed */ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
wake_up(&rdev->blocked_wait); return rv;
} staticstruct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
/* Add space to store bad block list. * This reserves the space even on arrays where it cannot * be used - I wonder if that matters
*/ return badblocks_init(&rdev->badblocks, 0);
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/* * Import a device. If 'super_format' >= 0, then sanity check the superblock * * mark the device faulty if: * * - the device is nonexistent (zero size) * - the device has no valid superblock * * a faulty rdev _never_ has rdev->sb set.
*/ staticstruct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{ struct md_rdev *rdev;
sector_t size; int err;
rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) return ERR_PTR(-ENOMEM);
err = md_rdev_init(rdev); if (err) goto out_free_rdev;
err = alloc_disk_sb(rdev); if (err) goto out_clear_rdev;
rdev->bdev_file = bdev_file_open_by_dev(newdev,
BLK_OPEN_READ | BLK_OPEN_WRITE,
super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev_file)) {
pr_warn("md: could not open device unknown-block(%u,%u).\n",
MAJOR(newdev), MINOR(newdev));
err = PTR_ERR(rdev->bdev_file); goto out_clear_rdev;
}
rdev->bdev = file_bdev(rdev->bdev_file);
kobject_init(&rdev->kobj, &rdev_ktype);
size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; if (!size) {
pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
rdev->bdev);
err = -EINVAL; goto out_blkdev_put;
}
if (super_format >= 0) {
err = super_types[super_format].
load_super(rdev, NULL, super_minor); if (err == -EINVAL) {
pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
rdev->bdev,
super_format, super_minor); goto out_blkdev_put;
} if (err < 0) {
pr_warn("md: could not read %pg's sb, not importing!\n",
rdev->bdev); goto out_blkdev_put;
}
}
i = 0;
rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks &&
(rdev->desc_nr >= mddev->max_disks ||
i > mddev->max_disks)) {
pr_warn("md: %s: %pg: only %d devices permitted\n",
mdname(mddev), rdev->bdev,
mddev->max_disks);
md_kick_rdev_from_array(rdev); continue;
} if (rdev != freshest) { if (super_types[mddev->major_version].
validate_super(mddev, freshest, rdev)) {
pr_warn("md: kicking non-fresh %pg from array!\n",
rdev->bdev);
md_kick_rdev_from_array(rdev); continue;
}
} if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
}
return 0;
}
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale'.
 * all without any floating-point arithmetic.
 *
 * @cp:    NUL-terminated input; one trailing '\n' is accepted.
 * @res:   receives the scaled value on success; untouched on failure.
 * @scale: number of decimal places to keep (result is value * 10^scale).
 *
 * Fractional digits beyond @scale are parsed but dropped (truncation, not
 * rounding).  A second '.' or any other non-digit character before the end
 * of the string is rejected.  An input with no digits yields 0.
 * The accumulation is not checked for unsigned long overflow.
 *
 * Return: 0 on success, -EINVAL on malformed input.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;	/* -1: no '.' seen yet; else fractional digits kept */

	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;

			value = *cp - '0';
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	/* Pad with zeros for the fractional places the input did not supply. */
	*res = result * int_pow(10, scale - decimals);
	return 0;
}
/* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape * - old personality can be suspended * - new personality will access other array.
*/
rv = -EINVAL; if (!mddev->pers->quiesce) {
pr_warn("md: %s: %s does not support online personality change\n",
mdname(mddev), mddev->pers->head.name); goto out_unlock;
}
/* Now find the new personality */
memcpy(clevel, buf, slen); if (clevel[slen-1] == '\n')
slen--;
clevel[slen] = 0; if (kstrtol(clevel, 10, &level))
level = LEVEL_NONE;
if (request_module("md-%s", clevel) != 0)
request_module("md-level-%s", clevel);
pers = get_pers(level, clevel); if (!pers) {
rv = -EINVAL; goto out_unlock;
}
if (pers == mddev->pers) { /* Nothing to do! */
put_pers(pers);
rv = len; goto out_unlock;
} if (!pers->takeover) {
put_pers(pers);
pr_warn("md: %s: %s does not support personality takeover\n",
mdname(mddev), clevel);
rv = -EINVAL; goto out_unlock;
}
/* ->takeover must set new_* and/or delta_disks * if it succeeds, and may set them when it fails.
*/
priv = pers->takeover(mddev); if (IS_ERR(priv)) {
mddev->new_level = mddev->level;
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
put_pers(pers);
pr_warn("md: %s: %s would not accept array\n",
mdname(mddev), clevel);
rv = PTR_ERR(priv); goto out_unlock;
}
/* Looks like we have a winner */
mddev_detach(mddev);
if (oldpers->sync_request == NULL &&
mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed * externally so we need to be sure that writes * won't block due to a need to transition * clean->dirty * until external management is started.
*/
mddev->in_sync = 0;
mddev->safemode_delay = 0;
mddev->safemode = 0;
}
oldpers->free(mddev, oldpriv);
if (oldpers->sync_request == NULL &&
pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
} if (oldpers->sync_request != NULL &&
pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ if (mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
}
put_pers(oldpers);
rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1; if (rdev->new_raid_disk == rdev->raid_disk) continue;
sysfs_unlink_rdev(mddev, rdev);
}
rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) continue;
rdev->raid_disk = rdev->new_raid_disk; if (rdev->raid_disk < 0)
clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev))
pr_warn("md: cannot register rd%d for %s after level change\n",
rdev->raid_disk, mdname(mddev));
}
}
if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync
*/
mddev->in_sync = 1;
timer_delete_sync(&mddev->safemode_timer);
}
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event();
rv = len;
out_unlock:
mddev_unlock_and_resume(mddev); return rv;
}
/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
 *     Writing this, if accepted, will block until array is quiescent
 * readonly
 *     no resync can happen.  no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
 * active
 *     fully active: IO and resync can be happening.
 *     When written to inactive array, starts with resync
 *
 * write-pending
 *     clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     like active, but no writes have been seen for a while (100msec).
 *
 * broken
 *     Array is failed. It's useful because mounted-arrays aren't stopped
 *     when array is failed, so this state will at least alert the user that
 *     something is wrong.
 */
/*
 * The enumerators index array_states[] one-to-one; bad_word lines up with
 * the table's terminating NULL, so keep both lists in the same order.
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean,
		   active, write_pending, active_idle, broken, bad_word};
static char *array_states[] = {
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean",
	"active", "write-pending", "active-idle", "broken", NULL };
/*
 * match_word() - look up a sysfs-written word in a NULL-terminated list.
 * @word: text to look up; compared with cmd_match(), so one trailing
 *        newline is tolerated.
 * @list: NULL-terminated array of candidate strings.
 *
 * Return: index of the first matching entry, or the index of the
 * terminating NULL when nothing matches.
 */
static int match_word(const char *word, char **list)
{
	int n;

	for (n = 0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}
if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { case MD_RDONLY:
st = readonly; break; case MD_AUTO_READ:
st = read_auto; break; case MD_RDWR:
spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending; elseif (mddev->in_sync)
st = clean; elseif (mddev->safemode)
st = active_idle; else
st = active;
spin_unlock(&mddev->lock);
}
if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
st = broken;
} else { if (list_empty(&mddev->disks) &&
mddev->raid_disks == 0 &&
mddev->dev_sectors == 0)
st = clear; else
st = inactive;
} return sprintf(page, "%s\n", array_states[st]);
}
static ssize_t
array_state_store(struct mddev *mddev, constchar *buf, size_t len)
{ int err = 0; enum array_state st = match_word(buf, array_states);
/* No lock dependent actions */ switch (st) { case suspended: /* not supported yet */ case write_pending: /* cannot be set */ case active_idle: /* cannot be set */ case broken: /* cannot be set */ case bad_word: return -EINVAL; case clear: case readonly: case inactive: case read_auto: if (!mddev->pers || !md_is_rdwr(mddev)) break; /* write sysfs will not open mddev and opener should be 0 */
err = mddev_set_closing_and_sync_blockdev(mddev, 0); if (err) return err; break; default: break;
}
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active
*/
spin_lock(&mddev->lock); if (st == active) {
restart_array(mddev);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
wake_up(&mddev->sb_wait);
} else/* st == clean */ {
restart_array(mddev); if (!set_in_sync(mddev))
err = -EBUSY;
} if (!err)
sysfs_notify_dirent_safe(mddev->sysfs_state);
spin_unlock(&mddev->lock); return err ?: len;
}
err = mddev_lock(mddev); if (err) return err;
switch (st) { case inactive: /* stop an active array, return 0 otherwise */ if (mddev->pers)
err = do_md_stop(mddev, 2); break; case clear:
err = do_md_stop(mddev, 0); break; case readonly: if (mddev->pers)
err = md_set_readonly(mddev); else {
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
err = do_md_run(mddev);
} break; case read_auto: if (mddev->pers) { if (md_is_rdwr(mddev))
err = md_set_readonly(mddev); elseif (mddev->ro == MD_RDONLY)
err = restart_array(mddev); if (err == 0) {
mddev->ro = MD_AUTO_READ;
set_disk_ro(mddev->gendisk, 0);
}
} else {
mddev->ro = MD_AUTO_READ;
err = do_md_run(mddev);
} break; case clean: if (mddev->pers) {
err = restart_array(mddev); if (err) break;
spin_lock(&mddev->lock); if (!set_in_sync(mddev))
err = -EBUSY;
spin_unlock(&mddev->lock);
} else
err = -EINVAL; break; case active: if (mddev->pers) {
err = restart_array(mddev); if (err) break;
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
wake_up(&mddev->sb_wait);
err = 0;
} else {
mddev->ro = MD_RDWR;
set_disk_ro(mddev->gendisk, 0);
err = do_md_run(mddev);
} break; default:
err = -EINVAL; break;
}
if (!err) { if (mddev->hold_active == UNTIL_IOCTL)
mddev->hold_active = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
mddev_unlock(mddev);
if (st == readonly || st == read_auto || st == inactive ||
(err && st == clear))
clear_bit(MD_CLOSING, &mddev->flags);
static ssize_t
new_dev_store(struct mddev *mddev, constchar *buf, size_t len)
{ /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. * If the array has a persistent superblock, we read the * superblock to initialise info and check validity. * Otherwise, only checking done is that in bind_rdev_to_array, * which mainly checks size.
*/ char *e; int major = simple_strtoul(buf, &e, 10); int minor;
dev_t dev; struct md_rdev *rdev; int err;
if (!*buf || *e != ':' || !e[1] || e[1] == '\n') return -EINVAL;
minor = simple_strtoul(e+1, &e, 10); if (*e && *e != '\n') return -EINVAL;
dev = MKDEV(major, minor); if (major != MAJOR(dev) ||
minor != MINOR(dev)) return -EOVERFLOW;
static ssize_t
size_store(struct mddev *mddev, constchar *buf, size_t len)
{ /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize
*/
sector_t sectors; int err = strict_blocks_to_sectors(buf, §ors);
/* Metadata version.
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
 * or N.M for internally known formats
 */
/*
 * Show handler: writes "<major>.<minor>\n" when the array has a persistent
 * superblock, "external:<metadata_type>\n" when metadata is externally
 * managed, and "none\n" otherwise.  Returns the number of bytes written
 * to @page.
 */
static ssize_t
metadata_show(struct mddev *mddev, char *page)
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
	else
		return sprintf(page, "none\n");
}
static ssize_t
metadata_store(struct mddev *mddev, constchar *buf, size_t len)
{ int major, minor; char *e; int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array.
*/
/* * frozen has the highest priority, means running sync_thread will be * stopped immediately, and no new sync_thread can start.
*/ if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return ACTION_FROZEN;
/* * read-only array can't register sync_thread, and it can only * add/remove spares.
*/ if (!md_is_rdwr(mddev)) return ACTION_IDLE;
/* * idle means no sync_thread is running, and no new sync_thread is * requested.
*/ if (!test_bit(MD_RECOVERY_RUNNING, &recovery) &&
!test_bit(MD_RECOVERY_NEEDED, &recovery)) return ACTION_IDLE;
/* * Check if any sync operation (resync/recover/reshape) is * currently active. This ensures that only one sync operation * can run at a time. Returns the type of active operation, or * ACTION_IDLE if none are active.
*/
active_action = md_get_active_sync_action(mddev); if (active_action != ACTION_IDLE) return active_action;
if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return ACTION_RECOVER;
if (test_bit(MD_RECOVERY_SYNC, &recovery)) { /* * MD_RECOVERY_CHECK must be paired with * MD_RECOVERY_REQUESTED.
*/ if (test_bit(MD_RECOVERY_CHECK, &recovery)) return ACTION_CHECK; if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) return ACTION_REPAIR; return ACTION_RESYNC;
}
/* * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no * sync_action is specified.
*/ return ACTION_IDLE;
}
/** * stop_sync_thread() - wait for sync_thread to stop if it's running. * @mddev: the array. * @locked: if set, reconfig_mutex will still be held after this function * return; if not set, reconfig_mutex will be released after this * function return.
*/ staticvoid stop_sync_thread(struct mddev *mddev, bool locked)
{ int sync_seq = atomic_read(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { if (!locked)
mddev_unlock(mddev); return;
}
mddev_unlock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); /* * Thread might be blocked waiting for metadata update which will now * never happen
*/
md_wakeup_thread_directly(mddev->sync_thread); if (work_pending(&mddev->sync_work))
flush_work(&mddev->sync_work);
staticint mddev_start_reshape(struct mddev *mddev)
{ int ret;
if (mddev->pers->start_reshape == NULL) return -EINVAL;
if (mddev->reshape_position == MaxSector ||
mddev->pers->check_reshape == NULL ||
mddev->pers->check_reshape(mddev)) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
ret = mddev->pers->start_reshape(mddev); if (ret) return ret;
} else { /* * If reshape is still in progress, and md_check_recovery() can * continue to reshape, don't restart reshape because data can * be corrupted for raid456.
*/
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL;
retry: if (work_busy(&mddev->sync_work))
flush_work(&mddev->sync_work);
ret = mddev_lock(mddev); if (ret) return ret;
if (work_busy(&mddev->sync_work)) {
mddev_unlock(mddev); goto retry;
}
action = md_sync_action_by_name(page);
/* TODO: mdadm rely on "idle" to start sync_thread. */ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { switch (action) { case ACTION_FROZEN:
md_frozen_sync_thread(mddev);
ret = len; goto out; case ACTION_IDLE:
md_idle_sync_thread(mddev); break; case ACTION_RESHAPE: case ACTION_RECOVER: case ACTION_CHECK: case ACTION_REPAIR: case ACTION_RESYNC:
ret = -EBUSY; goto out; default:
ret = -EINVAL; goto out;
}
} else { switch (action) { case ACTION_FROZEN:
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
ret = len; goto out; case ACTION_RESHAPE:
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
ret = mddev_start_reshape(mddev); if (ret) goto out; break; case ACTION_RECOVER:
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); break; case ACTION_CHECK:
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
fallthrough; case ACTION_REPAIR:
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
fallthrough; case ACTION_RESYNC: case ACTION_IDLE:
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); break; default:
ret = -EINVAL; goto out;
}
}
if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode
*/
mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
ret = len;
/* * Setting fail_last_dev to true to allow last device to be forcibly removed * from RAID1/RAID10.
*/ static ssize_t
fail_last_dev_store(struct mddev *mddev, constchar *buf, size_t len)
{ int ret; bool value;
ret = kstrtobool(buf, &value); if (ret) return ret;
if (value != mddev->fail_last_dev)
mddev->fail_last_dev = value;
if (legacy_async_del_gendisk) { if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state); if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
del_gendisk(mddev->gendisk);
}
put_disk(mddev->gendisk);
}
/* stack the limit for all rdevs into lim */ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, unsignedint flags)
{ struct md_rdev *rdev;
/* apply the extra stacking limits from a new rdev into mddev */ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
{ struct queue_limits lim;
struct mddev *md_alloc(dev_t dev, char *name)
{ /* * If dev is zero, name is the name of a device to allocate with * an arbitrary minor number. It will be "md_???" * If dev is non-zero it must be a device number with a MAJOR of * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then * the device is being created by opening a node in /dev. * If "name" is not NULL, the device is being created by * writing to /sys/module/md_mod/parameters/new_array.
*/ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev; struct gendisk *disk; int partitioned; int shift; int unit; int error;
/* * Wait for any previous instance of this device to be completely * removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
kobject_init(&mddev->kobj, &md_ktype);
error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) { /* * The disk is already live at this point. Clear the hold flag * and let mddev_put take care of the deletion, as it isn't any * different from a normal close on last release now.
*/
mddev->hold_active = 0;
mutex_unlock(&disks_mutex);
mddev_put(mddev); return ERR_PTR(error);
}
if (legacy_async_del_gendisk)
pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");
if (IS_ERR(mddev)) return PTR_ERR(mddev);
mddev_put(mddev); return 0;
}
/*
 * md_probe() - allocate an md device on demand for device number @dev.
 * @dev: device number being probed.
 *
 * Does nothing for MD_MAJOR minors of 512 and above, and does nothing
 * unless create_on_open is set. Otherwise the device is allocated via
 * md_alloc_and_put() with no name; its result is ignored here.
 */
static void md_probe(dev_t dev)
{
	/* Minors >= 512 on MD_MAJOR are never created through this path. */
	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
		return;

	if (create_on_open)
		md_alloc_and_put(dev, NULL);
}
staticint add_named_array(constchar *val, conststruct kernel_param *kp)
{ /* * val must be "md_*" or "mdNNN". * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. * For "mdNNN" we allocate an array with the minor number NNN * which must not already be in use.
*/ int len = strlen(val); char buf[DISK_NAME_LEN]; unsignedlong devnum;
mddev->safemode = 1; if (mddev->external)
sysfs_notify_dirent_safe(mddev->sysfs_state);
md_wakeup_thread(mddev->thread);
}
/* Copied into mddev->ok_start_degraded by md_run() before pers->run(). */
static int start_dirty_degraded;
int md_run(struct mddev *mddev)
{ int err; struct md_rdev *rdev; struct md_personality *pers; bool nowait = true;
if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. */ return -EINVAL;
if (mddev->pers) return -EBUSY; /* Cannot run until previous stop completes properly */ if (mddev->sysfs_active) return -EBUSY;
/* * Analyze all RAID superblock(s)
*/ if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL;
err = analyze_sbs(mddev); if (err) return -EINVAL;
}
if (mddev->level != LEVEL_NONE)
request_module("md-level-%d", mddev->level); elseif (mddev->clevel[0])
request_module("md-%s", mddev->clevel);
/* * Drop all container device buffers, from now on * the only valid external interface is through the md * device.
*/
mddev->has_superblocks = false;
rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev); if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY; if (!mddev_is_dm(mddev))
set_disk_ro(mddev->gendisk, 1);
}
if (rdev->sb_page)
mddev->has_superblocks = true;
/* perform some consistency tests on the device. * We don't want the data to overlap the metadata, * Internal Bitmap issues have been handled elsewhere.
*/ if (rdev->meta_bdev) { /* Nothing to check */;
} elseif (rdev->data_offset < rdev->sb_start) { if (mddev->dev_sectors &&
rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
pr_warn("md: %s: data overlaps metadata\n",
mdname(mddev)); return -EINVAL;
}
} else { if (rdev->sb_start + rdev->sb_size/512
> rdev->data_offset) {
pr_warn("md: %s: metadata overlaps data\n",
mdname(mddev)); return -EINVAL;
}
}
sysfs_notify_dirent_safe(rdev->sysfs_state);
nowait = nowait && bdev_nowait(rdev->bdev);
}
if (!bioset_initialized(&mddev->bio_set)) {
err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) return err;
} if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) goto exit_bio_set;
}
if (!bioset_initialized(&mddev->io_clone_set)) {
err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
offsetof(struct md_io_clone, bio_clone), 0); if (err) goto exit_sync_set;
}
if (pers->sync_request) { /* Warn if this is a potentially silly * configuration.
*/ struct md_rdev *rdev2; int warned = 0;
rdev_for_each(rdev, mddev)
rdev_for_each(rdev2, mddev) { if (rdev < rdev2 &&
rdev->bdev->bd_disk ==
rdev2->bdev->bd_disk) {
pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
mdname(mddev),
rdev->bdev,
rdev2->bdev);
warned = 1;
}
}
if (warned)
pr_warn("True protection against single-disk failure might be compromised.\n");
}
/* dm-raid expect sync_thread to be frozen until resume */ if (mddev->gendisk)
mddev->recovery = 0;
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && md_is_rdwr(mddev))
mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
err = pers->run(mddev); if (err)
pr_warn("md: pers->run() failed ...\n"); elseif (pers->size(mddev, 0, 0) < mddev->array_sectors) {
WARN_ONCE(!mddev->external_size, "%s: default size too small, but 'external_size' not in effect?\n",
__func__);
pr_warn("md: invalid array_size %llu > default size %llu\n",
(unsignedlonglong)mddev->array_sectors / 2,
(unsignedlonglong)pers->size(mddev, 0, 0) / 2);
err = -EINVAL;
} if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = mddev->bitmap_ops->create(mddev); if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
} if (err) goto bitmap_abort;
if (mddev->bitmap_info.max_write_behind > 0) { bool create_pool = false;
if (pers->sync_request) { if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_redundancy_group))
pr_warn("md: cannot register extra attributes for %s\n",
mdname(mddev));
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
} elseif (mddev->ro == MD_AUTO_READ)
mddev->ro = MD_RDWR;
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0; if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0; else
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
mddev->pers = pers;
spin_unlock(&mddev->lock);
rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0)
sysfs_link_rdev(mddev, rdev); /* failure here is OK */
if (mddev->degraded && md_is_rdwr(mddev)) /* This ensures that recovering status is reported immediately * via sysfs - until a lack of spares is confirmed.
*/
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
/* Complain if it has no devices */ if (list_empty(&mddev->disks)) return -ENXIO; if (!mddev->pers) return -EINVAL; if (md_is_rdwr(mddev)) return -EBUSY;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) { if (test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
has_journal = true; if (rdev_read_only(rdev))
has_readonly = true;
}
rcu_read_unlock(); if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) /* Don't restart rw with journal missing/faulty */ return -EINVAL; if (has_readonly) return -EROFS;
mddev->safemode = 0;
mddev->ro = MD_RDWR;
set_disk_ro(disk, 0);
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_state); return 0;
}
staticvoid md_clean(struct mddev *mddev)
{
mddev->array_sectors = 0;
mddev->external_size = 0;
mddev->dev_sectors = 0;
mddev->raid_disks = 0;
mddev->resync_offset = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector; /* we still need mddev->external in export_rdev, do not clear it yet */
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
/* * For legacy_async_del_gendisk mode, it can stop the array in the * middle of assembling it, then it still can access the array. So * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, * it can't open the array again after stopping it. So it doesn't * clear MD_CLOSING.
*/ if (legacy_async_del_gendisk && mddev->hold_active) {
clear_bit(MD_CLOSING, &mddev->flags);
} else { /* if UNTIL_STOP is set, it's cleared here */
mddev->hold_active = 0; /* Don't clear MD_CLOSING, or mddev can be opened again. */
mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
}
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
mddev->chunk_sectors = 0;
mddev->ctime = mddev->utime = 0;
mddev->layout = 0;
mddev->max_disks = 0;
mddev->events = 0;
mddev->can_decrease_events = 0;
mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
mddev->new_level = LEVEL_NONE;
mddev->new_layout = 0;
mddev->new_chunk_sectors = 0;
mddev->curr_resync = MD_RESYNC_NONE;
atomic64_set(&mddev->resync_mismatches, 0);
mddev->suspend_lo = mddev->suspend_hi = 0;
mddev->sync_speed_min = mddev->sync_speed_max = 0;
mddev->recovery = 0;
mddev->in_sync = 0;
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
mddev->private = NULL;
mddev->cluster_info = NULL;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.default_space = 0;
mddev->bitmap_info.chunksize = 0;
mddev->bitmap_info.daemon_sleep = 0;
mddev->bitmap_info.max_write_behind = 0;
mddev->bitmap_info.nodes = 0;
}
/* mode: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array
*/ staticint do_md_stop(struct mddev *mddev, int mode)
{ struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; int did_freeze = 0;
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
stop_sync_thread(mddev, true);
if (mddev->sysfs_active ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev)); if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} return -EBUSY;
} if (mddev->pers) { if (!md_is_rdwr(mddev))
set_disk_ro(disk, 0);
__md_stop_writes(mddev);
__md_stop(mddev);
/* tell userspace to handle 'inactive' */
sysfs_notify_dirent_safe(mddev->sysfs_state);
rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0)
sysfs_unlink_rdev(mddev, rdev);
err = do_md_run(mddev); if (err) {
pr_warn("md: do_md_run() returned %d\n", err);
do_md_stop(mddev, 0);
}
}
/* * lets try to run arrays based on all disks that have arrived * until now. (those are in pending_raid_disks) * * the method: pick the first pending disk, collect all disks with * the same UUID, remove all from the pending list and put them into * the 'same_array' list. Then order this list based on superblock * update time (freshest comes first), kick out 'old' disks and * compare superblocks. If everything's fine then run it. * * If "unit" is allocated, then bump its reference count
*/ staticvoid autorun_devices(int part)
{ struct md_rdev *rdev0, *rdev, *tmp; struct mddev *mddev;
pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { int unit;
dev_t dev;
LIST_HEAD(candidates);
rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set);
pr_debug("md: considering %pg ...\n", rdev0->bdev);
INIT_LIST_HEAD(&candidates);
rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) {
pr_debug("md: adding %pg ...\n",
rdev->bdev);
list_move(&rdev->same_set, &candidates);
} /* * now we have a set of devices, with all of them having * mostly sane superblocks. It's time to allocate the * mddev.
*/ if (part) {
dev = MKDEV(mdp_major,
rdev0->preferred_minor << MdpMinorShift);
unit = MINOR(dev) >> MdpMinorShift;
} else {
dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
unit = MINOR(dev);
} if (rdev0->preferred_minor != unit) {
pr_warn("md: unit number in %pg is bad: %d\n",
rdev0->bdev, rdev0->preferred_minor); break;
}
mddev = md_alloc(dev, NULL); if (IS_ERR(mddev)) break;
if (mddev_suspend_and_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev)); elseif (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
pr_warn("md: %s already running, cannot run %pg\n",
mdname(mddev), rdev0->bdev);
mddev_unlock_and_resume(mddev);
} else {
pr_debug("md: created %s\n", mdname(mddev));
mddev->persistent = 1;
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev))
export_rdev(rdev, mddev);
}
autorun_array(mddev);
mddev_unlock_and_resume(mddev);
} /* on success, candidates will be empty, on error * it won't...
*/
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev, mddev);
}
mddev_put(mddev);
}
pr_info("md: ... autorun DONE.\n");
} #endif/* !MODULE */
if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW;
if (!mddev->raid_disks) { int err; /* expecting a device which has a superblock */
rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) {
pr_warn("md: md_import_device returned %ld\n",
PTR_ERR(rdev)); return PTR_ERR(rdev);
} if (!list_empty(&mddev->disks)) { struct md_rdev *rdev0
= list_entry(mddev->disks.next, struct md_rdev, same_set);
err = super_types[mddev->major_version]
.load_super(rdev, rdev0, mddev->minor_version); if (err < 0) {
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev,
rdev0->bdev);
export_rdev(rdev, mddev); return -EINVAL;
}
}
err = bind_rdev_to_array(rdev, mddev); if (err)
export_rdev(rdev, mddev); return err;
}
/* * md_add_new_disk can be used once the array is assembled * to add "hot spares". They must already have a superblock * written
*/ if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) {
pr_warn("%s: personality does not support diskops!\n",
mdname(mddev)); return -EINVAL;
} if (mddev->persistent)
rdev = md_import_device(dev, mddev->major_version,
mddev->minor_version); else
rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) {
pr_warn("md: md_import_device returned %ld\n",
PTR_ERR(rdev)); return PTR_ERR(rdev);
} /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) &&
info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
clear_bit(Bitmap_sync, &rdev->flags);
} else
rdev->raid_disk = -1;
rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
validate_super(mddev, NULL/*freshest*/, rdev); if ((info->state & (1<<MD_DISK_SYNC)) &&
rdev->raid_disk != info->raid_disk) { /* This was a hot-add request, but events doesn't * match, so reject it.
*/
export_rdev(rdev, mddev); return -EINVAL;
}
clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags); else
clear_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags); else
clear_bit(FailFast, &rdev->flags);
if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) {
err = mddev->cluster_ops->new_disk_ack(
mddev, err == 0); if (err)
md_kick_rdev_from_array(rdev);
}
} else { if (err)
mddev->cluster_ops->add_new_disk_cancel(mddev); else
err = add_bound_rdev(rdev);
}
} elseif (!err)
err = add_bound_rdev(rdev);
return err;
}
/* otherwise, md_add_new_disk is only allowed * for major_version==0 superblocks
*/ if (mddev->major_version != 0) {
pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); return -EINVAL;
}
if (!(info->state & (1<<MD_DISK_FAULTY))) { int err;
rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) {
pr_warn("md: error, md_import_device() returned %ld\n",
PTR_ERR(rdev)); return PTR_ERR(rdev);
}
rdev->desc_nr = info->number; if (info->raid_disk < mddev->raid_disks)
rdev->raid_disk = info->raid_disk; else
rdev->raid_disk = -1;
if (rdev->raid_disk < mddev->raid_disks) if (info->state & (1<<MD_DISK_SYNC))
set_bit(In_sync, &rdev->flags);
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags); if (info->state & (1<<MD_DISK_FAILFAST))
set_bit(FailFast, &rdev->flags);
if (mddev->major_version != 0) {
pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
mdname(mddev)); return -EINVAL;
} if (!mddev->pers->hot_add_disk) {
pr_warn("%s: personality does not support diskops!\n",
mdname(mddev)); return -EINVAL;
}
rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) {
pr_warn("md: error, md_import_device() returned %ld\n",
PTR_ERR(rdev)); return -EINVAL;
}
if (mddev->persistent)
rdev->sb_start = calc_dev_sboffset(rdev); else
rdev->sb_start = bdev_nr_sectors(rdev->bdev);
rdev->sectors = rdev->sb_start;
if (test_bit(Faulty, &rdev->flags)) {
pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
rdev->bdev, mdname(mddev));
err = -EINVAL; goto abort_export;
}
/* * The rest should better be atomic, we can have disk failures * noticed in interrupt contexts ...
*/
rdev->raid_disk = -1;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread)
md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the * array immediately.
*/
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_new_event(); return 0;
staticint set_bitmap_file(struct mddev *mddev, int fd)
{ int err = 0;
if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; if (mddev->recovery || mddev->sync_thread) return -EBUSY; /* we should be able to change the bitmap.. */
}
if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */
if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
pr_warn("%s: bitmap files not supported by this kernel\n",
mdname(mddev)); return -EINVAL;
}
pr_warn("%s: using deprecated bitmap file support\n",
mdname(mddev));
f = fget(fd);
if (f == NULL) {
pr_warn("%s: error: failed to get bitmap file\n",
mdname(mddev)); return -EBADF;
}
inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) {
pr_warn("%s: error: bitmap file must be a regular file\n",
mdname(mddev));
err = -EBADF;
} elseif (!(f->f_mode & FMODE_WRITE)) {
pr_warn("%s: error: bitmap file must open for write\n",
mdname(mddev));
err = -EBADF;
} elseif (atomic_read(&inode->i_writecount) != 1) {
pr_warn("%s: error: bitmap file is already in use\n",
mdname(mddev));
err = -EBUSY;
} if (err) {
fput(f); return err;
}
mddev->bitmap_info.file = f;
mddev->bitmap_info.offset = 0; /* file overrides offset */
} elseif (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */
err = 0; if (mddev->pers) { if (fd >= 0) {
err = mddev->bitmap_ops->create(mddev); if (!err)
err = mddev->bitmap_ops->load(mddev);
/* * md_set_array_info is used in two different ways * The original usage is when creating a new array. * In this usage, raid_disks is > 0 and it together with * level, size, not_persistent, layout, chunksize determine the * shape of the array. * This will always create an array with a type-0.90.0 superblock. * The newer usage is when assembling an array. * In this case raid_disks will be 0, and the major_version field is * used to determine which style super-blocks are to be found on the devices. * The minor and patch _version numbers are also kept in case the * super_block handler wishes to interpret them.
*/ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{ if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 ||
info->major_version >= ARRAY_SIZE(super_types) ||
super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */
pr_warn("md: superblock version %d not known\n",
info->major_version); return -EINVAL;
}
mddev->major_version = info->major_version;
mddev->minor_version = info->minor_version;
mddev->patch_version = info->patch_version;
mddev->persistent = !info->not_persistent; /* ensure mddev_put doesn't delete this now that there * is some minimal configuration.
*/
mddev->ctime = ktime_get_real_seconds(); return 0;
}
mddev->major_version = MD_MAJOR_VERSION;
mddev->minor_version = MD_MINOR_VERSION;
mddev->patch_version = MD_PATCHLEVEL_VERSION;
mddev->ctime = ktime_get_real_seconds();
mddev->level = info->level;
mddev->clevel[0] = 0;
mddev->dev_sectors = 2 * (sector_t)info->size;
mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned
*/ if (info->state & (1<<MD_SB_CLEAN))
mddev->resync_offset = MaxSector; else
mddev->resync_offset = 0;
mddev->persistent = ! info->not_persistent;
mddev->external = 0;
mddev->layout = info->layout; if (mddev->level == 0) /* Cannot trust RAID0 layout info here */
mddev->layout = -1;
mddev->chunk_sectors = info->chunk_size >> 9;
staticint update_size(struct mddev *mddev, sector_t num_sectors)
{ struct md_rdev *rdev; int rv; int fit = (num_sectors == 0);
sector_t old_dev_sectors = mddev->dev_sectors;
if (mddev->pers->resize == NULL) return -EINVAL; /* The "num_sectors" is the number of sectors of each device that * is used. This can only make sense for arrays with redundancy. * linear and raid0 always use whatever space is available. We can only * consider changing this number if no resync or reconstruction is * happening, and if the new size is acceptable. It must fit before the * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits.
*/ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (!md_is_rdwr(mddev)) return -EROFS;
/* * update_array_info is used to change the configuration of an * on-line array. * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size * fields in the info are checked against the array. * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time.
*/ staticint update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{ int rv = 0; int cnt = 0; int state = 0;
/* calculate expected state,ignoring low bits */ if (mddev->bitmap && mddev->bitmap_info.offset)
state |= (1 << MD_SB_BITMAP_PRESENT);
if (mddev->major_version != info->major_version ||
mddev->minor_version != info->minor_version || /* mddev->patch_version != info->patch_version || */
mddev->ctime != info->ctime ||
mddev->level != info->level || /* mddev->layout != info->layout || */
mddev->persistent != !info->not_persistent ||
mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
((state^info->state) & 0xfffffe00)
) return -EINVAL; /* Check there is only one change */ if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
cnt++; if (mddev->raid_disks != info->raid_disks)
cnt++; if (mddev->layout != info->layout)
cnt++; if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
cnt++; if (cnt == 0) return 0; if (cnt > 1) return -EINVAL;
if (mddev->layout != info->layout) { /* Change layout * we don't need to do anything at the md level, the * personality will take care of it all.
*/ if (mddev->pers->check_reshape == NULL) return -EINVAL; else {
mddev->new_layout = info->layout;
rv = mddev->pers->check_reshape(mddev); if (rv)
mddev->new_layout = mddev->layout; return rv;
}
} if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks)
rv = update_raid_disks(mddev, info->raid_disks);
if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
rv = -EINVAL; goto err;
} if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY; goto err;
} if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { /* add the bitmap */ if (mddev->bitmap) {
rv = -EEXIST; goto err;
} if (mddev->bitmap_info.default_offset == 0) {
rv = -EINVAL; goto err;
}
mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
rv = mddev->bitmap_ops->create(mddev); if (!rv)
rv = mddev->bitmap_ops->load(mddev);
if (rv)
mddev->bitmap_ops->destroy(mddev);
} else { struct md_bitmap_stats stats;
rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (rv) goto err;
if (stats.file) {
rv = -EINVAL; goto err;
}
if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) {
pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
rv = -EPERM;
mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err;
}
/* * We have a problem here : there is no easy way to give a CHS * virtual geometry. We currently pretend that we have a 2 heads * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-)
*/ staticint md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{ struct mddev *mddev = bdev->bd_disk->private_data;
staticinlineint md_ioctl_valid(unsignedint cmd)
{ switch (cmd) { case GET_ARRAY_INFO: case GET_DISK_INFO: case RAID_VERSION: return 0; case ADD_NEW_DISK: case GET_BITMAP_FILE: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case RESTART_ARRAY_RW: case RUN_ARRAY: case SET_ARRAY_INFO: case SET_BITMAP_FILE: case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: case CLUSTERED_DISK_NACK: if (!capable(CAP_SYS_ADMIN)) return -EACCES; return 0; default: return -ENOTTY;
}
}
staticbool md_ioctl_need_suspend(unsignedint cmd)
{ switch (cmd) { case ADD_NEW_DISK: case HOT_ADD_DISK: case HOT_REMOVE_DISK: case SET_BITMAP_FILE: case SET_ARRAY_INFO: returntrue; default: returnfalse;
}
}
/* * Commands dealing with the RAID driver but not any * particular array:
*/ if (cmd == RAID_VERSION) return get_version(argp);
/* * Commands creating/starting a new array:
*/
mddev = bdev->bd_disk->private_data;
/* Some actions do not requires the mutex */ switch (cmd) { case GET_ARRAY_INFO: if (!mddev->raid_disks && !mddev->external) return -ENODEV; return get_array_info(mddev, argp);
case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) return -ENODEV; return get_disk_info(mddev, argp);
case SET_DISK_FAULTY: return set_disk_faulty(mddev, new_decode_dev(arg));
case GET_BITMAP_FILE: return get_bitmap_file(mddev, argp);
}
if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens * and writes
*/
err = mddev_set_closing_and_sync_blockdev(mddev, 1); if (err) return err;
}
if (!md_is_rdwr(mddev))
flush_work(&mddev->sync_work);
/* * Commands querying/configuring an existing array:
*/ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
* RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ if ((!mddev->raid_disks && !mddev->external)
&& cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
&& cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
&& cmd != GET_BITMAP_FILE) {
err = -ENODEV; goto unlock;
}
/* * Commands even a read-only array can execute:
*/ switch (cmd) { case RESTART_ARRAY_RW:
err = restart_array(mddev); goto unlock;
case STOP_ARRAY:
err = do_md_stop(mddev, 0); goto unlock;
case STOP_ARRAY_RO: if (mddev->pers)
err = md_set_readonly(mddev); goto unlock;
case HOT_REMOVE_DISK:
err = hot_remove_disk(mddev, new_decode_dev(arg)); goto unlock;
case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays * only if we are re-adding a preexisting device. * So require mddev->pers and MD_DISK_SYNC.
*/ if (mddev->pers) {
mdu_disk_info_t info; if (copy_from_user(&info, argp, sizeof(info)))
err = -EFAULT; elseif (!(info.state & (1<<MD_DISK_SYNC))) /* Need to clear read-only for this */ break; else
err = md_add_new_disk(mddev, &info); goto unlock;
} break;
}
/* * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays.
*/ if (!md_is_rdwr(mddev) && mddev->pers) { if (mddev->ro != MD_AUTO_READ) {
err = -EROFS; goto unlock;
}
mddev->ro = MD_RDWR;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* mddev_unlock will wake thread */ /* If a device failed while we were read-only, we * need to make sure the metadata is updated now.
*/ if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
mddev_unlock(mddev);
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
}
}
/* * Transitioning to read-auto need only happen for arrays that call * md_write_start and which are not ready for writes yet.
*/ if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
err = restart_array(mddev); if (err) goto out_unlock;
mddev->ro = MD_AUTO_READ;
}
/* * md_thread is a 'system-thread', its priority should be very * high. We avoid resource deadlocks individually in each * raid personality. (RAID5 does preallocation) We also use RR and * the very same RT priority as kswapd, thus we will never get * into a priority inversion deadlock. * * we definitely have to have equal or higher priority than * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks.
*/
allow_signal(SIGKILL); while (!kthread_should_stop()) {
/* We need to wait INTERRUPTIBLE so that * we don't add to the load-average. * That means we need to be sure no signals are * pending
*/ if (signal_pending(current))
flush_signals(current);
resync = mddev->curr_resync; if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */
resync = max_sectors;
} elseif (resync > max_sectors) {
resync = max_sectors;
} else {
res = atomic_read(&mddev->recovery_active); /* * Resync has started, but the subtraction has overflowed or * yielded one of the special values. Force it to active to * ensure the status reports an active resync.
*/ if (resync < res || resync - res < MD_RESYNC_ACTIVE)
resync = MD_RESYNC_ACTIVE; else
resync -= res;
}
if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev;
WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10
*/
scale = 10; if (sizeof(sector_t) > sizeof(unsignedlong)) { while ( max_sectors/2 > (1ULL<<(scale+32)))
scale++;
}
res = (resync>>scale)*1000;
sector_div(res, (u32)((max_sectors>>scale)+1));
per_milli = res;
{ int i, x = per_milli/50, y = 20-x;
seq_printf(seq, "["); for (i = 0; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">"); for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
}
seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
(test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? "reshape" :
(test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? "check" :
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"))),
per_milli/10, per_milli % 10,
(unsignedlonglong) resync/2,
(unsignedlonglong) max_sectors/2);
/* * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time * * rt is a sector_t, which is always 64bit now. We are keeping * the original algorithm, but it is not really necessary. * * Original algorithm: * So we divide before multiply in case it is 32bit and close * to the limit. * We scale the divisor (db) by 32 to avoid losing precision * near the end of resync when the number of remaining sectors * is close to 'db'. * We then divide rt by 32 after multiplying by db to compensate. * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++;
/* * If rdev is partition, and user doesn't issue IO to the array, the * array is still not idle if user issues IO to other partitions.
*/
rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
sectors) -
part_stat_read_accum(rdev->bdev, sectors);
/*
 * mddev is idle if following conditions are matched since last check:
 * 1) mddev doesn't have normal IO completed;
 * 2) mddev doesn't have inflight normal IO;
 * 3) if any member disk is partition, and other partitions don't have IO
 *    completed;
 *
 * Note that this check relies on IO accounting being enabled.
 *
 * @mddev: array to examine.
 * @init:  passed through unchanged to is_rdev_holder_idle() for each rdev.
 *
 * Return: true when no gendisk is attached (neither dm_gendisk nor
 * gendisk), otherwise true only if is_rdev_holder_idle() reported idle
 * for every member device.
 *
 * NOTE(review): in the code visible here 'last_events' is captured from
 * mddev->normal_io_events but never compared, so conditions 1) and 2)
 * above are not evaluated in this body - confirm whether lines are
 * missing before relying on this function for them.
 */
static bool is_mddev_idle(struct mddev *mddev, int init)
{
	unsigned long last_events = mddev->normal_io_events;
	struct gendisk *disk;
	struct md_rdev *rdev;
	bool idle = true;

	disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
	if (!disk)
		return true;

	/* Visit every rdev; do not stop at the first busy one. */
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (!is_rdev_holder_idle(rdev, init))
			idle = false;
	rcu_read_unlock();

	return idle;
}
/*
 * md_done_sync() - account for sync IO that has completed.
 * @mddev:  array the sync IO belongs to
 * @blocks: number of 512-byte blocks that have just been synced
 * @ok:     zero if the IO failed, non-zero otherwise
 *
 * Subtracts @blocks from mddev->recovery_active and wakes everything waiting
 * on mddev->recovery_wait.  On failure the recovery is additionally flagged
 * as interrupted and errored, and mddev->thread is woken.
 */
void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);

	if (ok)
		return;

	/* The sync IO failed: stop recovery and signal do_sync. */
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
EXPORT_SYMBOL(md_done_sync);
/*
 * md_write_start(mddev, bi)
 * If some array metadata (e.g. the 'active' flag in the superblock) has to be
 * updated before writing, schedule a superblock update and wait for it to
 * complete.
 *
 * Bios that are not writes return immediately.  For writes, a reference on
 * mddev->writes_pending is taken; it is not dropped in this function.  When
 * the array has superblocks, the call blocks until MD_SB_CHANGE_PENDING is
 * clear.
 */
void md_write_start(struct mddev *mddev, struct bio *bi)
{
	int state_changed = 0;

	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* first write to an auto-read-only array: go read/write */
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		state_changed = 1;
	}

	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;

	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		/* Re-test under the lock before marking the array dirty. */
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			state_changed = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();

	if (state_changed)
		sysfs_notify_dirent_safe(mddev->sysfs_state);

	/* Only arrays with superblocks have a pending update to wait for. */
	if (mddev->has_superblocks)
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL(md_write_start);
/*
 * md_write_inc() may only be called once md_write_start() has already been
 * called at least once for the current request.
 *
 * It takes one more reference on mddev->writes_pending, which is useful when
 * a single request is split into several parts: each part causes an
 * increment and therefore needs a matching md_write_end().
 *
 * Unlike md_write_start(), it is safe to call md_write_inc() inside a
 * spinlocked region.  Bios that are not writes are ignored.
 */
void md_write_inc(struct mddev *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) == WRITE) {
		/* The array should already be dirty and read-write here. */
		WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
		percpu_ref_get(&mddev->writes_pending);
	}
}
EXPORT_SYMBOL(md_write_inc);
if (mddev->safemode == 2)
md_wakeup_thread(mddev->thread); elseif (mddev->safemode_delay) /* The roundup() ensures this only performs locking once * every ->safemode_delay jiffies
*/
mod_timer(&mddev->safemode_timer,
roundup(jiffies, mddev->safemode_delay) +
mddev->safemode_delay);
}
EXPORT_SYMBOL(md_write_end);
/* This is used by raid0 and raid10 */ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size)
{ struct bio *discard_bio = NULL;
if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
&discard_bio) || !discard_bio) return;
/*
 * md_allow_write(mddev)
 * Ensure the array is marked 'active' so that writes may proceed without
 * blocking.  It is important to call this before attempting a GFP_KERNEL
 * allocation while holding the mddev lock.
 *
 * Nothing is done when the array has no personality, is not read-write, or
 * its personality provides no sync_request method.
 *
 * Must be called with mddev_lock held.
 */
void md_allow_write(struct mddev *mddev)
{
	if (!mddev->pers || !md_is_rdwr(mddev) || !mddev->pers->sync_request)
		return;

	spin_lock(&mddev->lock);
	if (!mddev->in_sync) {
		/* Already marked dirty, nothing to record. */
		spin_unlock(&mddev->lock);
		return;
	}

	mddev->in_sync = 0;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
	if (mddev->safemode_delay && mddev->safemode == 0)
		mddev->safemode = 1;
	spin_unlock(&mddev->lock);

	md_update_sb(mddev, 0);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	/* wait for the dirty state to be recorded in the metadata */
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL_GPL(md_allow_write);
/*
 * md_sync_max_sectors() - sector limit to use for a given sync action.
 * @mddev:  array being synced
 * @action: the sync action about to run
 *
 * Resync, check and repair first reset mddev->resync_mismatches to zero and
 * then, like reshape, use mddev->resync_max_sectors.  Recovery uses
 * mddev->dev_sectors.  Any other action yields 0.
 */
static sector_t md_sync_max_sectors(struct mddev *mddev,
				    enum sync_action action)
{
	switch (action) {
	case ACTION_RECOVER:
		return mddev->dev_sectors;
	case ACTION_RESHAPE:
		return mddev->resync_max_sectors;
	case ACTION_RESYNC:
	case ACTION_CHECK:
	case ACTION_REPAIR:
		/* These actions start counting mismatches from scratch. */
		atomic64_set(&mddev->resync_mismatches, 0);
		return mddev->resync_max_sectors;
	default:
		return 0;
	}
}
switch (action) { case ACTION_CHECK: case ACTION_REPAIR: return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* * If the original node aborts reshaping then we continue the * reshaping, so set again to avoid restart reshape from the * first beginning
*/ if (mddev_is_clustered(mddev) &&
mddev->reshape_position != MaxSector) return mddev->reshape_position; return 0; case ACTION_RECOVER:
start = MaxSector;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) if (rdev_needs_recovery(rdev, start))
start = rdev->recovery_offset;
rcu_read_unlock();
/* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. * Otherwise the write might complete and (via * bitmap_endwrite) set a bit in the bitmap after the * recovery has checked that bit and skipped that * region.
*/ if (mddev->bitmap) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
} return start; default: return MaxSector;
}
}
staticbool sync_io_within_limit(struct mddev *mddev)
{ int io_sectors;
/* * For raid456, sync IO is stripe(4k) per IO, for other levels, it's * RESYNC_PAGES(64k) per IO.
*/ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
io_sectors = 8; else
io_sectors = 128;
/* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return;
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip;
if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip;
}
if (mddev_is_clustered(mddev)) {
ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip;
/* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher * we wait on resync_wait. To avoid deadlock, we reduce curr_resync * to 1 if we choose to yield (based arbitrarily on address of mddev structure). * This will mean we have to start checking from the beginning again. *
*/ if (mddev_is_clustered(mddev))
mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1;
mddev->curr_resync = MD_RESYNC_DELAYED;
try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip;
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { if (test_bit(MD_DELETED, &mddev2->flags)) continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync
&& mddev2->curr_resync
&& match_mddev_units(mddev, mddev2)) {
DEFINE_WAIT(wq); if (mddev < mddev2 &&
mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */
mddev->curr_resync = MD_RESYNC_YIELDED;
wake_up(&resync_wait);
} if (mddev > mddev2 &&
mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2
*/ continue; /* We need to wait 'interruptible' so as not to * contribute to the load average, and not to * be caught by 'softlockup'
*/
prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev2->curr_resync >= mddev->curr_resync) { if (mddev2_minor != mddev2->md_minor) {
mddev2_minor = mddev2->md_minor;
pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
desc, mdname(mddev),
mdname(mddev2));
}
spin_unlock(&all_mddevs_lock);
if (signal_pending(current))
flush_signals(current);
schedule();
finish_wait(&resync_wait, &wq); goto try_again;
}
finish_wait(&resync_wait, &wq);
}
}
spin_unlock(&all_mddevs_lock);
} while (mddev->curr_resync < MD_RESYNC_DELAYED);
pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
speed_max(mddev), desc);
is_mddev_idle(mddev, 1); /* this initializes IO event counters */
/* * Tune reconstruction:
*/
window = 32 * (PAGE_SIZE / 512);
pr_debug("md: using %dk window, over a total of %lluk.\n",
window/2, (unsignedlonglong)max_sectors/2);
while (j >= mddev->resync_max &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings.
*/
flush_signals(current); /* just in case */
wait_event_interruptible(mddev->recovery_wait,
mddev->resync_max > j
|| test_bit(MD_RECOVERY_INTR,
&mddev->recovery));
}
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break;
if (!skipped) { /* actual IO requested */
io_sectors += sectors;
atomic_add(sectors, &mddev->recovery_active);
}
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break;
j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */
j = max_sectors; if (j >= MD_RESYNC_ACTIVE)
mddev->curr_resync = j;
mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be * visible in /proc/mdstat
*/
md_new_event();
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break;
/* * this loop exits only if either when we are slower than * the 'hard' speed limit, or the system was IO-idle for * a jiffy. * the system might be non-idle CPU-wise, but we only care * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast)
*/
cond_resched();
if (currspeed > speed_min(mddev)) { if (currspeed > speed_max(mddev)) {
msleep(500); goto repeat;
} if (!sync_io_within_limit(mddev) &&
!is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait.
*/
wait_event(mddev->recovery_wait,
!atomic_read(&mddev->recovery_active));
}
}
}
pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
test_bit(MD_RECOVERY_INTR, &mddev->recovery)
? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop
*/
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->resync_offset) {
pr_debug("md: checkpointing %s of %s.\n",
desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR,
&mddev->recovery))
mddev->resync_offset =
mddev->curr_resync_completed; else
mddev->resync_offset =
mddev->curr_resync;
}
} else
mddev->resync_offset = MaxSector;
} else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) if (mddev->delta_disks >= 0 &&
rdev_needs_recovery(rdev, mddev->curr_resync))
rdev->recovery_offset = mddev->curr_resync;
rcu_read_unlock();
}
}
}
skip: /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal
* raid */
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
staticbool rdev_removeable(struct md_rdev *rdev)
{ /* rdev is not used. */ if (rdev->raid_disk < 0) returnfalse;
/* There are still inflight io, don't remove this rdev. */ if (atomic_read(&rdev->nr_pending)) returnfalse;
/* * An error occurred but has not yet been acknowledged by the metadata * handler, don't remove this rdev.
*/ if (test_bit(Blocked, &rdev->flags)) returnfalse;
/* Fautly rdev is not used, it's safe to remove it. */ if (test_bit(Faulty, &rdev->flags)) returntrue;
/* Journal disk can only be removed if it's faulty. */ if (test_bit(Journal, &rdev->flags)) returnfalse;
/* * 'In_sync' is cleared while 'raid_disk' is valid, which means * replacement has just become active from pers->spare_active(), and * then pers->hot_remove_disk() will replace this rdev with replacement.
*/ if (!test_bit(In_sync, &rdev->flags)) returntrue;
mddev = READ_ONCE(rdev->mddev); if (!mddev) returnfalse;
/* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
test_bit(Faulty, &rdev->flags)) returnfalse;
/* Allow to add journal disk. */ if (test_bit(Journal, &rdev->flags)) returntrue;
/* Allow to add if array is read-write. */ if (md_is_rdwr(mddev)) returntrue;
/* * For read-only array, only allow to readd a rdev. And if bitmap is * used, don't allow to readd a rdev that is too old.
*/ if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) returntrue;
if (removed && mddev->kobj.sd)
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
return removed;
}
/*
 * remove_and_add_spares() - remove devices via remove_spares(), then try to
 * hot-add the devices that rdev_addable() accepts.
 * @mddev: array to operate on
 * @this:  if non-NULL, only this rdev is considered; if NULL, all rdevs are
 *
 * When @this is given, nothing is touched while MD_RECOVERY_RUNNING is set,
 * and no add is attempted if remove_spares() removed something.
 *
 * MD_SB_CHANGE_DEVS is set whenever a device was removed or successfully
 * added.
 *
 * Returns the number of spares counted: every visited rdev for which
 * rdev_is_spare() is true, plus every non-journal rdev that was hot-added
 * successfully.  Returns 0 when skipped because a sync thread is running.
 */
static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	removed = remove_spares(mddev, this);
	if (this && removed)
		goto no_add;

	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (rdev_is_spare(rdev))
			spares++;
		if (!rdev_addable(rdev))
			continue;
		/* Non-journal devices restart recovery from sector 0. */
		if (!test_bit(Journal, &rdev->flags))
			rdev->recovery_offset = 0;
		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
			/* failure here is OK */
			sysfs_link_rdev(mddev, rdev);
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event();
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}
staticbool md_choose_sync_action(struct mddev *mddev, int *spares)
{ /* Check if reshape is in progress first. */ if (mddev->reshape_position != MaxSector) { if (mddev->pers->check_reshape == NULL ||
mddev->pers->check_reshape(mddev) != 0) returnfalse;
/* Check if resync is in progress. */ if (mddev->resync_offset < MaxSector) {
remove_spares(mddev, NULL);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); returntrue;
}
/* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the * re-add.
*/
*spares = remove_and_add_spares(mddev, NULL); if (*spares) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
/* Start new recovery. */
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); returntrue;
}
/* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) returntrue;
/* * If reshape is still in progress, spares won't be added or removed * from conf until reshape is done.
*/ if (mddev->reshape_position == MaxSector &&
md_spares_need_change(mddev)) {
suspend = true;
mddev_suspend(mddev, false);
}
mddev_lock_nointr(mddev); if (!md_is_rdwr(mddev)) { /* * On a read-only array we can: * - remove failed devices * - add already-in_sync devices if the array itself is in-sync. * As we only add devices that are already in-sync, we can * activate the spares immediately.
*/
remove_and_add_spares(mddev, NULL); goto not_running;
}
if (!md_choose_sync_action(mddev, &spares)) goto not_running;
if (!mddev->pers->sync_request) goto not_running;
/* * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written.
*/ if (spares)
mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? "reshape" : "resync";
rcu_assign_pointer(mddev->sync_thread,
md_register_thread(md_do_sync, mddev, name)); if (!mddev->sync_thread) {
pr_warn("%s: could not start resync thread...\n",
mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ goto not_running;
}
mddev_unlock(mddev); /* * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should * not set it again. Otherwise, we may cause issue like this one: * https://bugzilla.kernel.org/show_bug.cgi?id=218200 * Therefore, use __mddev_resume(mddev, false).
*/ if (suspend)
__mddev_resume(mddev, false);
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event(); return;
not_running:
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev_unlock(mddev); /* * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should * not set it again. Otherwise, we may cause issue like this one: * https://bugzilla.kernel.org/show_bug.cgi?id=218200 * Therefore, use __mddev_resume(mddev, false).
*/ if (suspend)
__mddev_resume(mddev, false);
wake_up(&resync_wait); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
mddev->sysfs_action)
sysfs_notify_dirent_safe(mddev->sysfs_action);
}
/*
 * unregister_sync_thread() - reap the sync thread once it has finished.
 * @mddev: array whose sync thread should be reaped
 *
 * If MD_RECOVERY_DONE is not set yet, only MD_RECOVERY_NEEDED is cleared and
 * the function returns.  Otherwise md_reap_sync_thread() is called, after
 * warning once (and bailing out) if there is no sync thread to reap.
 */
static void unregister_sync_thread(struct mddev *mddev)
{
	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
		/* resync/recovery still happening */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		return;
	}

	if (WARN_ON_ONCE(!mddev->sync_thread))
		return;

	md_reap_sync_thread(mddev);
}
/* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. * Raid personalities that don't have a thread (linear/raid0) do not * need this as they never do any recovery or update the superblock. * * It does not do any resync itself, but rather "forks" off other threads * to do that as needed. * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in * "->recovery" and create a thread at ->sync_thread. * When the thread finishes it sets MD_RECOVERY_DONE * and wakeups up this thread which will reap the thread and finish up. * This thread also removes any faulty devices (with nr_pending == 0). * * The overall approach is: * 1/ if the superblock needs updating, update it. * 2/ If a recovery thread is running, don't do anything else. * 3/ If recovery has finished, clean up, possibly marking spares active. * 4/ If there are any faulty devices, remove them. * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread.
*/ void md_check_recovery(struct mddev *mddev)
{ if (mddev->bitmap)
mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) {
pr_debug("md: %s in immediate safe mode\n",
mdname(mddev));
mddev->safemode = 2;
}
flush_signals(current);
}
if (mddev_trylock(mddev)) { bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
if (!md_is_rdwr(mddev)) { struct md_rdev *rdev;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
unregister_sync_thread(mddev); goto unlock;
}
if (!mddev->external && mddev->in_sync) /* * 'Blocked' flag not needed as failed devices * will be recorded if array switched to read/write. * Leaving it set will prevent the device * from being removed.
*/
rdev_for_each(rdev, mddev)
clear_bit(Blocked, &rdev->flags);
/* * There is no thread, but we need to call * ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
/* * Let md_start_sync() to remove and add rdevs to the * array.
*/ if (md_spares_need_change(mddev)) {
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
queue_work(md_misc_wq, &mddev->sync_work);
}
/* * Never start a new sync thread if MD_RECOVERY_RUNNING is * still set.
*/ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
unregister_sync_thread(mddev); goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action".
*/
mddev->curr_resync_completed = 0;
spin_lock(&mddev->lock);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set
*/
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
/* resync has finished, collect result */
md_unregister_thread(mddev, &mddev->sync_thread);
atomic_inc(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) { /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) {
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}
} if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape) {
mddev->pers->finish_reshape(mddev); if (mddev_is_clustered(mddev))
is_reshaped = true;
}
/* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped.
*/ if (!mddev->degraded)
rdev_for_each(rdev, mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1); /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
* clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
mddev->cluster_ops->resync_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster.
*/ if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event(); if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work);
wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);
/*
 * rdev_set_badblocks() - record a range of bad blocks on a member device.
 * @rdev:    member device the bad range belongs to
 * @s:       start sector, relative to the data area selected by @is_new
 * @sectors: length of the range in sectors
 * @is_new:  non-zero to offset @s by rdev->new_data_offset, zero to use
 *           rdev->data_offset
 *
 * On success the sysfs state entries are notified, MD_SB_CHANGE_CLEAN and
 * MD_SB_CHANGE_PENDING are set and the array thread is woken.
 *
 * Returns true on success, false on failure.  A Faulty @rdev is reported as
 * success without recording anything.
 */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for faulty rdev will force unnecessary
	 * super block updating. This is fragile for external management
	 * because userspace daemon may trying to remove this device and
	 * deadlock may occur. This will be probably solved in the mdadm, but
	 * it is safer to avoid it.
	 */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
		return false;

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
	md_wakeup_thread(mddev->thread);
	return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
/*
 * rdev_clear_badblocks() - drop a range from a member device's bad block list.
 * @rdev:    member device the range belongs to
 * @s:       start sector, relative to the data area selected by @is_new
 * @sectors: length of the range in sectors
 * @is_new:  non-zero to offset @s by rdev->new_data_offset, zero to use
 *           rdev->data_offset
 *
 * When badblocks_clear() returns non-zero and the device has ExternalBbl set,
 * the sysfs bad-blocks entry is notified.
 */
void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	s += is_new ? rdev->new_data_offset : rdev->data_offset;

	if (badblocks_clear(&rdev->badblocks, s, sectors) &&
	    test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue;
spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers)
__md_stop_writes(mddev); if (mddev->persistent)
mddev->safemode = 2;
mddev_unlock(mddev);
}
need_delay = 1;
spin_lock(&all_mddevs_lock);
mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
/* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the * right place to handle this issue is the given * driver, we do want to have a safe RAID driver ...
*/ if (need_delay)
msleep(1000);
return NOTIFY_DONE;
}
staticstruct notifier_block md_notifier = {
.notifier_call = md_notify_reboot,
.next = NULL,
.priority = INT_MAX, /* before any real devices */
};
/* * If size is changed in another node then we need to * do resize as well.
*/ if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret)
pr_info("md-cluster: resize failed\n"); else
mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) { if (test_bit(ClusterRemove, &rdev2->flags))
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); continue;
}
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) { if (role == MD_DISK_ROLE_FAULTY) {
pr_info("md: Removing Candidate device %pg because add failed\n",
rdev2->bdev);
md_kick_rdev_from_array(rdev2); continue;
} else
clear_bit(Candidate, &rdev2->flags);
}
if (role != rdev2->raid_disk) { /* * got activated except reshape is happening.
*/ if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE) &&
!mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the * remote node is down during resyncing.
*/ if ((le32_to_cpu(sb->feature_map)
& MD_FEATURE_RECOVERY_OFFSET))
rdev2->saved_raid_disk = -1; else
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %pg\n",
rdev2->bdev); /* wakeup mddev->thread here, so array could
* perform resync with the new activated disk */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} /* device faulty * We just want to do the minimum to mark the disk * as faulty. The recovery is performed by the * one who initiated the error.
*/ if (role == MD_DISK_ROLE_FAULTY ||
role == MD_DISK_ROLE_JOURNAL) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
}
}
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); if (ret)
pr_warn("md: updating array disks failed. %d\n", ret);
}
/* * Since mddev->delta_disks has already updated in update_raid_disks, * so it is time to check reshape.
*/ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* * reshape is happening in the remote node, we need to * update reshape_position and call start_reshape.
*/
mddev->reshape_position = le64_to_cpu(sb->reshape_position); if (mddev->pers->update_reshape_pos)
mddev->pers->update_reshape_pos(mddev); if (mddev->pers->start_reshape)
mddev->pers->start_reshape(mddev);
} elseif (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
mddev->reshape_position != MaxSector &&
!(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { /* reshape is just done in another node. */
mddev->reshape_position = MaxSector; if (mddev->pers->update_reshape_pos)
mddev->pers->update_reshape_pos(mddev);
}
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
/* Store the sb page of the rdev in the swapout temporary * variable in case we err in the future
*/
rdev->sb_page = NULL;
err = alloc_disk_sb(rdev); if (err == 0) {
ClearPageUptodate(rdev->sb_page);
rdev->sb_loaded = 0;
err = super_types[mddev->major_version].
load_super(rdev, NULL, mddev->minor_version);
} if (err < 0) {
pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
__func__, __LINE__, rdev->desc_nr, err); if (rdev->sb_page)
put_page(rdev->sb_page);
rdev->sb_page = swapout;
rdev->sb_loaded = 1; return err;
}
sb = page_address(rdev->sb_page); /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET * is not set
*/
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
/* The other node finished recovery, call spare_active to set * device In_sync and mddev->degraded
*/ if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
put_page(swapout); return 0;
}
/*
 * md_reload_sb() - reload the superblock of the member device numbered @nr.
 * @mddev: array the device belongs to
 * @nr:    desc_nr of the device whose superblock should be re-read
 *
 * Looks up the rdev whose desc_nr equals @nr, re-reads it with read_rdev()
 * and hands it to check_sb_changes().  Afterwards every rdev that is not
 * Faulty is re-read as well so that recovery_offset gets updated.
 *
 * A warning is logged and nothing else happens when no rdev matches @nr; the
 * function also stops early when the first read_rdev() fails.
 */
void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *target = NULL, *cur;

	/* Find the rdev */
	rdev_for_each_rcu(cur, mddev) {
		if (cur->desc_nr != nr)
			continue;
		target = cur;
		break;
	}

	if (!target) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	if (read_rdev(mddev, target) < 0)
		return;

	check_sb_changes(mddev, target);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(cur, mddev) {
		if (!test_bit(Faulty, &cur->flags))
			read_rdev(mddev, cur);
	}
}
EXPORT_SYMBOL(md_reload_sb);
#ifndef MODULE
/* * Searches all registered partitions for autorun RAID arrays * at boot time.
*/
/* We cannot unload the modules while some process is * waiting for us in select() or poll() - wake them up
*/
md_unloading = 1; while (waitqueue_active(&md_event_waiters)) { /* not safe to leave yet */
wake_up(&md_event_waiters);
msleep(delay);
delay += delay;
}
remove_proc_entry("mdstat", NULL);
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue;
spin_unlock(&all_mddevs_lock);
export_array(mddev);
mddev->ctime = 0;
mddev->hold_active = 0; /* * As the mddev is now fully clear, mddev_put will schedule * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete.
*/
spin_lock(&all_mddevs_lock);
mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.234 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.