/*
 * Bitmap superblock version numbers.
 *
 * With version 4 the on-disk bitmap is stored in little-endian order;
 * with version 3 it is host-endian, which is non-portable.
 * Version 5 is currently set only for clustered devices.
 */
#define BITMAP_MAJOR_LO		3
#define BITMAP_MAJOR_HOSTENDIAN	3
#define BITMAP_MAJOR_HI		4
#define BITMAP_MAJOR_CLUSTERED	5
/*
 * in-memory bitmap:
 *
 * Use 16 bit block counters to track pending writes to each "chunk".
 * The 2 high order bits are special-purpose, the first is a flag indicating
 * whether a resync is needed.  The second is a flag indicating whether a
 * resync is active.
 * This means that the counter is actually 14 bits:
 *
 * +--------+--------+------------------------------------------------+
 * | resync | resync |               counter                          |
 * | needed | active |                                                |
 * | (0-1)  | (0-1)  |              (0-16383)                         |
 * +--------+--------+------------------------------------------------+
 *
 * The "resync needed" bit is set when:
 *    a '1' bit is read from storage at startup.
 *    a write request fails on some drives
 *    a resync is aborted on a chunk with 'resync active' set
 * It is cleared (and resync-active set) when a resync starts across all drives
 * of the chunk.
 *
 *
 * The "resync active" bit is set when:
 *    a resync is started on all drives, and resync_needed is set.
 *    resync_needed will be cleared (as long as resync_active wasn't already
 *    set).
 * It is cleared when a resync completes.
 *
 * The counter counts pending write requests, plus the on-disk bit.
 * When the counter is '1' and the resync bits are clear, the on-disk
 * bit can be cleared as well, thus setting the counter to 0.
 * When we set a bit, or in the counter (to start a write), if the field is
 * 0, we first set the disk bit and set the counter to 1.
 *
 * If the counter is 0, the on-disk bit is clear and the stripe is clean.
 * Anything that dirties the stripe pushes the counter to 2 (at least)
 * and sets the on-disk bit (lazily).
 * If a periodic sweep finds the counter at 2, it is decremented to 1.
 * If the sweep finds the counter at 1, the on-disk bit is cleared and the
 * counter goes to zero.
 *
 * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
 * counters as a fallback when "page" memory cannot be allocated:
 *
 * Normal case (page memory allocated):
 *
 *     page pointer (32-bit)
 *
 *     [ ] ------+
 *                |
 *                +-------> [   ][   ]..[   ] (4096 byte page == 2048 counters)
 *                           c1   c2    c2048
 *
 * Hijacked case (page memory allocation failed):
 *
 *     hijacked page pointer (32-bit)
 *
 *     [                  ][                  ] (no page memory allocated)
 *      counter #1 (16-bit) counter #2 (16-bit)
 *
 */
/* how many counters per page? */
#define PAGE_COUNTER_RATIO	(PAGE_BITS / COUNTER_BITS)
/* same, except a shift value for more efficient bitops */
#define PAGE_COUNTER_SHIFT	(PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
/* same, except a mask value for more efficient bitops */
#define PAGE_COUNTER_MASK	(PAGE_COUNTER_RATIO - 1)

/* 1 << 9 == 512, i.e. one 512-byte sector per bitmap block unit */
#define BITMAP_BLOCK_SHIFT 9
/* * bitmap structures:
*/
/* the in-memory bitmap is represented by bitmap_pages */
struct bitmap_page {
	/*
	 * map points to the actual memory page
	 */
	char *map;
	/*
	 * in emergencies (when map cannot be alloced), hijack the map
	 * pointer and use it as two counters itself
	 */
	unsigned int hijacked:1;
	/*
	 * If any counter in this page is '1' or '2' - and so could be
	 * cleared then that page is marked as 'pending'
	 */
	unsigned int pending:1;
	/*
	 * count of dirty bits on the page
	 */
	unsigned int count:30;
};
/* the main bitmap structure - one per mddev */
struct bitmap {

	struct bitmap_counts {
		spinlock_t lock;
		struct bitmap_page *bp;
		/* total number of pages in the bitmap */
		unsigned long pages;
		/* number of pages not yet allocated */
		unsigned long missing_pages;
		/* chunksize = 2^chunkshift (for bitops) */
		unsigned long chunkshift;
		/* total number of data chunks for the array */
		unsigned long chunks;
	} counts;

	struct mddev *mddev;	/* the md device that the bitmap is for */

	__u64 events_cleared;
	int need_sync;

	struct bitmap_storage {
		/* backing disk file */
		struct file *file;
		/* cached copy of the bitmap file superblock */
		struct page *sb_page;
		unsigned long sb_index;
		/* list of cache pages for the file */
		struct page **filemap;
		/* attributes associated filemap pages */
		unsigned long *filemap_attr;
		/* number of pages in the file */
		unsigned long file_pages;
		/* total bytes in the bitmap */
		unsigned long bytes;
	} storage;

	unsigned long flags;

	int allclean;

	atomic_t behind_writes;
	/* highest actual value at runtime */
	unsigned long behind_writes_used;

	/*
	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
	 * file, cleaning up bits and flushing out pages to disk as necessary
	 */
	unsigned long daemon_lastrun;	/* jiffies of last run */
	/*
	 * when we lasted called end_sync to update bitmap with resync
	 * progress.
	 */
	unsigned long last_end_sync;

	/* pending writes to the bitmap file */
	atomic_t pending_writes;
	wait_queue_head_t write_wait;
	wait_queue_head_t overflow_wait;
	wait_queue_head_t behind_wait;

	struct kernfs_node *sysfs_can_clear;
	/* slot offset for clustered env */
	int cluster_slot;
};
static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
			   int chunksize, bool init);
/* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * * 1) check to see if this page is allocated, if it's not then try to alloc * 2) if the alloc fails, set the page's hijacked flag so we'll use the * page pointer directly as a counter * * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it
*/ staticint md_bitmap_checkpage(struct bitmap_counts *bitmap, unsignedlong page, int create, int no_hijack)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{ unsignedchar *mappage;
WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0;
if (bitmap->bp[page].map) /* page is already allocated, just return */ return 0;
if (!create) return -ENOENT;
/* this page has not been allocated yet */
spin_unlock_irq(&bitmap->lock); /* It is possible that this is being called inside a * prepare_to_wait/finish_wait loop from raid5c:make_request(). * In general it is not permitted to sleep in that context as it * can cause the loop to spin freely. * That doesn't apply here as we can only reach this point * once with any loop. * When this function completes, either bp[page].map or * bp[page].hijacked. In either case, this function will * abort before getting to this point again. So there is * no risk of a free-spin, and so it is safe to assert * that sleeping here is allowed.
*/
sched_annotate_sleep();
mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
spin_lock_irq(&bitmap->lock);
if (mappage == NULL) {
pr_debug("md/bitmap: map page allocation failed, hijacking\n"); /* We don't support hijack for cluster raid */ if (no_hijack) return -ENOMEM; /* failed - set the hijacked flag so that we can use the
* pointer as a counter */ if (!bitmap->bp[page].map)
bitmap->bp[page].hijacked = 1;
} elseif (bitmap->bp[page].map ||
bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */
kfree(mappage);
} else {
/* no page was in place and we have one, so install it */
/* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ staticvoid md_bitmap_checkfree(struct bitmap_counts *bitmap, unsignedlong page)
{ char *ptr;
if (bitmap->bp[page].count) /* page is still busy */ return;
/* page is no longer in use, it can be released */
if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
bitmap->bp[page].hijacked = 0;
bitmap->bp[page].map = NULL;
} else { /* normal case, free the page */
ptr = bitmap->bp[page].map;
bitmap->bp[page].map = NULL;
bitmap->missing_pages++;
kfree(ptr);
}
}
/* * bitmap file handling - read and write the bitmap file and its superblock
*/
/* * basic page I/O operations
*/
/* IO operations when bitmap is stored near all superblocks */
/* choose a good rdev and read the page from there */ staticint read_sb_page(struct mddev *mddev, loff_t offset, struct page *page, unsignedlong index, int size)
{
staticstruct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
{ /* Iterate the disks of an mddev, using rcu to protect access to the * linked list, and raising the refcount of devices we return to ensure * they don't disappear while in use. * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter * list_for_each_entry_continue_rcu. * * Note that if entered with 'rdev == NULL' to start at the * beginning, we temporarily assign 'rdev' to an address which * isn't really an rdev, but which can be used by * list_for_each_entry_continue_rcu() to find the first entry.
*/
rcu_read_lock(); if (rdev == NULL) /* start at the beginning */
rdev = list_entry(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */
rdev_dec_pending(rdev, mddev);
}
list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */
atomic_inc(&rdev->nr_pending);
rcu_read_unlock(); return rdev;
}
}
rcu_read_unlock(); return NULL;
}
/* read a page from a file. * We both read the page, and attach buffers to the page to record the * address of each block (using bmap). These addresses will be used * to write the block later, completely bypassing the filesystem. * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing.
*/ staticint read_file_page(struct file *file, unsignedlong index, struct bitmap *bitmap, unsignedlong count, struct page *page)
{ int ret = 0; struct inode *inode = file_inode(file); struct buffer_head *bh;
sector_t block, blk_cur; unsignedlong blocksize = i_blocksize(inode);
/*
 * md_bitmap_wait_writes() should be called before writing any bitmap
 * blocks, to ensure previous writes, particularly from
 * md_bitmap_daemon_work(), have completed.
 */
static void md_bitmap_wait_writes(struct bitmap *bitmap)
{
	if (bitmap->storage.file)
		wait_event(bitmap->write_wait,
			   atomic_read(&bitmap->pending_writes) == 0);
	else
		/* Note that we ignore the return value.  The writes
		 * might have failed, but that would just mean that
		 * some bits which should be cleared haven't been,
		 * which is safe.  The relevant bitmap blocks will
		 * probably get written again, but there is no great
		 * loss if they aren't.
		 */
		md_super_wait(bitmap->mddev);
}
/* update the event counter and sync the superblock to disk */ staticvoid bitmap_update_sb(void *data)
{
bitmap_super_t *sb; struct bitmap *bitmap = data;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; if (bitmap->mddev->bitmap_info.external) return; if (!bitmap->storage.sb_page) /* no superblock */ return;
sb = kmap_local_page(bitmap->storage.sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->events_cleared); /* * clear BITMAP_WRITE_ERROR bit to protect against the case that * a bitmap write error occurred but the later writes succeeded.
*/
sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); /* This might have been changed by a reshape */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_local(sb);
/* * bitmap_new_disk_sb * @bitmap * * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb * reads and verifies the on-disk bitmap superblock and populates bitmap_info. * This function verifies 'bitmap_info' and populates the on-disk bitmap * structure, which is to be written to disk. * * Returns: 0 on success, -Exxx on error
*/ staticint md_bitmap_new_disk_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb; unsignedlong chunksize, daemon_sleep, write_behind;
chunksize = bitmap->mddev->bitmap_info.chunksize;
BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) {
kunmap_local(sb);
pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL;
}
sb->chunksize = cpu_to_le32(chunksize);
/* * FIXME: write_behind for RAID1. If not specified, what * is a good choice? We choose COUNTER_MAX / 2 arbitrarily.
*/
write_behind = bitmap->mddev->bitmap_info.max_write_behind; if (write_behind > COUNTER_MAX / 2)
write_behind = COUNTER_MAX / 2;
sb->write_behind = cpu_to_le32(write_behind);
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
/* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic"; elseif (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version"; elseif (chunksize < 512)
reason = "bitmap chunksize too small"; elseif (!is_power_of_2(chunksize))
reason = "bitmap chunksize not a power of 2"; elseif (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT)
reason = "daemon sleep period out of range"; elseif (write_behind > COUNTER_MAX)
reason = "write-behind limit out of range (0 - 16383)"; if (reason) {
pr_warn("%s: invalid bitmap file superblock: %s\n",
bmname(bitmap), reason); goto out;
}
/* * Setup nodes/clustername only if bitmap version is * cluster-compatible
*/ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strscpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
}
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
if (bitmap->mddev->persistent) { /* * We have a persistent array superblock, so compare the * bitmap's UUID and event counter to the mddev's
*/ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
pr_warn("%s: bitmap superblock UUID mismatch\n",
bmname(bitmap)); goto out;
}
events = le64_to_cpu(sb->events); if (!nodes && (events < bitmap->mddev->events)) {
pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
bmname(bitmap), events,
(unsignedlonglong) bitmap->mddev->events);
set_bit(BITMAP_STALE, &bitmap->flags);
}
}
/* assign fields using values from superblock */
bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
err = 0;
out:
kunmap_local(sb); if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { /* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
err = md_setup_cluster(bitmap->mddev, nodes); if (err) {
pr_warn("%s: Could not setup cluster service (%d)\n",
bmname(bitmap), err); goto out_no_sb;
}
bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev); goto re_read;
}
/* * on-disk bitmap: * * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap * file a page at a time. There's a superblock at the start of the file.
*/ /* calculate the index of the page that contains this bit */ staticinlineunsignedlong file_page_index(struct bitmap_storage *store, unsignedlong chunk)
{ if (store->sb_page)
chunk += sizeof(bitmap_super_t) << 3; return chunk >> PAGE_BIT_SHIFT;
}
/* calculate the (bit) offset of this bit within a page */ staticinlineunsignedlong file_page_offset(struct bitmap_storage *store, unsignedlong chunk)
{ if (store->sb_page)
chunk += sizeof(bitmap_super_t) << 3; return chunk & (PAGE_BITS - 1);
}
/* * return a pointer to the page in the filemap that contains the given bit *
*/ staticinlinestruct page *filemap_get_page(struct bitmap_storage *store, unsignedlong chunk)
{ if (file_page_index(store, chunk) >= store->file_pages) return NULL; return store->filemap[file_page_index(store, chunk)];
}
staticint md_bitmap_storage_alloc(struct bitmap_storage *store, unsignedlong chunks, int with_super, int slot_number)
{ int pnum, offset = 0; unsignedlong num_pages; unsignedlong bytes;
bytes = DIV_ROUND_UP(chunks, 8); if (with_super)
bytes += sizeof(bitmap_super_t);
/* We need 4 bits per page, rounded up to a multiple
* of sizeof(unsigned long) */
store->filemap_attr = kzalloc(
roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsignedlong)),
GFP_KERNEL); if (!store->filemap_attr) return -ENOMEM;
/*
 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
 * then it is no longer reliable, so we stop using it and we mark the file
 * as failed in the superblock
 */
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
	if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
		bitmap_update_sb(bitmap);

		if (bitmap->storage.file) {
			pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
				bmname(bitmap), bitmap->storage.file);
		} else
			pr_warn("%s: disabling internal bitmap due to errors\n",
				bmname(bitmap));
	}
}
/* per-page attribute flags kept in bitmap_storage.filemap_attr (4 bits per
 * page — see md_bitmap_storage_alloc and test_and_clear_page_attr) */
enum bitmap_page_attr {
	BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */
	BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned.
				  * i.e. counter is 1 or 2. */
	BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
};
/* atomically test-and-clear one attribute bit for file page 'pnum' */
static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
					   enum bitmap_page_attr attr)
{
	return test_and_clear_bit((pnum << 2) + attr,
				  bitmap->storage.filemap_attr);
}

/*
 * bitmap_file_set_bit -- called before performing a write to the md device
 * to set (and eventually sync) a particular bit in the bitmap file
 *
 * we set the bit immediately, then we record the page number so that
 * when an unplug occurs, we can flush the dirty pages out to disk
 */
static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
	unsigned long bit;
	struct page *page;
	void *kaddr;
	unsigned long chunk = block >> bitmap->counts.chunkshift;
	struct bitmap_storage *store = &bitmap->storage;
	unsigned long index = file_page_index(store, chunk);
	unsigned long node_offset = 0;

	index += store->sb_index;
	if (mddev_is_clustered(bitmap->mddev))
		node_offset = bitmap->cluster_slot * store->file_pages;

	page = filemap_get_page(&bitmap->storage, chunk);
	if (!page)
		return;
	bit = file_page_offset(&bitmap->storage, chunk);

	/* set the bit */
	kaddr = kmap_local_page(page);
	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
		set_bit(bit, kaddr);
	else
		set_bit_le(bit, kaddr);
	kunmap_local(kaddr);
	pr_debug("set file bit %lu page %lu\n", bit, index);
	/* record page number so it gets flushed to disk when unplug occurs */
	set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}
page = filemap_get_page(&bitmap->storage, chunk); if (!page) return -EINVAL;
bit = file_page_offset(&bitmap->storage, chunk);
paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set = test_bit(bit, paddr); else
set = test_bit_le(bit, paddr);
kunmap_local(paddr); return set;
}
/* this gets called when the md device is ready to unplug its underlying
 * (slave) device queues -- before we let any writes go down, we need to
 * sync the dirty pages of the bitmap file to disk */
static void __bitmap_unplug(struct bitmap *bitmap)
{
	unsigned long i;
	int dirty, need_write;
	int writing = 0;

	if (!__bitmap_enabled(bitmap))
		return;

	/* look at each page to see if there are any set bits that need to be
	 * flushed out to disk */
	for (i = 0; i < bitmap->storage.file_pages; i++) {
		dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
		need_write = test_and_clear_page_attr(bitmap, i,
						      BITMAP_PAGE_NEEDWRITE);
		if (dirty || need_write) {
			if (!writing) {
				md_bitmap_wait_writes(bitmap);
				mddev_add_trace_msg(bitmap->mddev,
						    "md bitmap_unplug");
			}
			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
			filemap_write_page(bitmap, i, false);
			writing = 1;
		}
	}
	if (writing)
		md_bitmap_wait_writes(bitmap);

	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
		md_bitmap_file_kick(bitmap);
}
if (sync)
__bitmap_unplug(bitmap); else
bitmap_unplug_async(bitmap);
}
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
				      int needed);
/* * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory * mapping of the bitmap file. * * Special case: If there's no bitmap file, or if the bitmap file had been * previously kicked from the array, we mark all the bits as 1's in order to * cause a full resync. * * We ignore all bits for sectors that end earlier than 'start'. * This is used when reading an out-of-date bitmap.
*/ staticint md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{ bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags); struct mddev *mddev = bitmap->mddev; unsignedlong chunks = bitmap->counts.chunks; struct bitmap_storage *store = &bitmap->storage; struct file *file = store->file; unsignedlong node_offset = 0; unsignedlong bit_cnt = 0; unsignedlong i; int ret;
if (!file && !mddev->bitmap_info.offset) { /* No permanent bitmap - fill with '1s'. */
store->filemap = NULL;
store->file_pages = 0; for (i = 0; i < chunks ; i++) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
>= start);
md_bitmap_set_memory_bits(bitmap,
(sector_t)i << bitmap->counts.chunkshift,
needed);
} return 0;
}
if (file && i_size_read(file->f_mapping->host) < store->bytes) {
pr_warn("%s: bitmap file too short %lu < %lu\n",
bmname(bitmap),
(unsignedlong) i_size_read(file->f_mapping->host),
store->bytes);
ret = -ENOSPC; goto err;
}
if (mddev_is_clustered(mddev))
node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
for (i = 0; i < store->file_pages; i++) { struct page *page = store->filemap[i]; int count;
/* unmap the old page, we're done with it */ if (i == store->file_pages - 1)
count = store->bytes - i * PAGE_SIZE; else
count = PAGE_SIZE;
if (file)
ret = read_file_page(file, i, bitmap, count, page); else
ret = read_sb_page(mddev, 0, page, i + node_offset,
count); if (ret) goto err;
}
if (outofdate) {
pr_warn("%s: bitmap file is out of date, doing full recovery\n",
bmname(bitmap));
for (i = 0; i < store->file_pages; i++) { struct page *page = store->filemap[i]; unsignedlong offset = 0; void *paddr;
if (i == 0 && !mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
/* * If the bitmap is out of date, dirty the whole page * and write it out
*/
paddr = kmap_local_page(page);
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
kunmap_local(paddr);
filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
ret = -EIO; goto err;
}
}
}
for (i = 0; i < chunks; i++) { struct page *page = filemap_get_page(&bitmap->storage, i); unsignedlong bit = file_page_offset(&bitmap->storage, i); void *paddr; bool was_set;
if (was_set) { /* if the disk bit is set, set the memory bit */ int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
>= start);
md_bitmap_set_memory_bits(bitmap,
(sector_t)i << bitmap->counts.chunkshift,
needed);
bit_cnt++;
}
}
pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
bmname(bitmap), store->file_pages,
bit_cnt, chunks);
/* just flag bitmap pages as needing to be written. */
static void bitmap_write_all(struct mddev *mddev)
{
	int i;
	struct bitmap *bitmap = mddev->bitmap;

	if (!bitmap || !bitmap->storage.filemap)
		return;

	/* Only one copy, so nothing needed */
	if (bitmap->storage.file)
		return;

	for (i = 0; i < bitmap->storage.file_pages; i++)
		set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
	bitmap->allclean = 0;
}
if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT)
thread->timeout = timeout;
out:
rcu_read_unlock();
}
/* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk
*/ staticvoid bitmap_daemon_work(struct mddev *mddev)
{ struct bitmap *bitmap; unsignedlong j; unsignedlong nextpage;
sector_t blocks; struct bitmap_counts *counts;
/* Use a mutex to guard daemon_work against * bitmap_destroy.
*/
mutex_lock(&mddev->bitmap_info.mutex);
bitmap = mddev->bitmap; if (bitmap == NULL) {
mutex_unlock(&mddev->bitmap_info.mutex); return;
} if (time_before(jiffies, bitmap->daemon_lastrun
+ mddev->bitmap_info.daemon_sleep)) goto done;
/* Any file-page which is PENDING now needs to be written. * So set NEEDWRITE now, then after we make any last-minute changes * we will write it.
*/ for (j = 0; j < bitmap->storage.file_pages; j++) if (test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_PENDING))
set_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE);
if (bitmap->need_sync &&
mddev->bitmap_info.external == 0) { /* Arrange for superblock update as well as
* other changes */
bitmap_super_t *sb;
bitmap->need_sync = 0; if (bitmap->storage.filemap) {
sb = kmap_local_page(bitmap->storage.sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
kunmap_local(sb);
set_page_attr(bitmap, 0,
BITMAP_PAGE_NEEDWRITE);
}
} /* Now look at the bitmap counters and if any are '2' or '1', * decrement and handle accordingly.
*/
counts = &bitmap->counts;
spin_lock_irq(&counts->lock);
nextpage = 0; for (j = 0; j < counts->chunks; j++) {
bitmap_counter_t *bmc;
sector_t block = (sector_t)j << counts->chunkshift;
bmc = md_bitmap_get_counter(counts, block, &blocks, 0); if (!bmc) {
j |= PAGE_COUNTER_MASK; continue;
} if (*bmc == 1 && !bitmap->need_sync) { /* We can clear the bit */
*bmc = 0;
md_bitmap_count_page(counts, block, -1);
md_bitmap_file_clear_bit(bitmap, block);
} elseif (*bmc && *bmc <= 2) {
*bmc = 1;
md_bitmap_set_pending(counts, block);
bitmap->allclean = 0;
}
}
spin_unlock_irq(&counts->lock);
md_bitmap_wait_writes(bitmap); /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. * DIRTY pages need to be written by bitmap_unplug so it can wait * for them. * If we find any DIRTY page we stop there and let bitmap_unplug * handle all the rest. This is important in the case where * the first blocking holds the superblock and it has been updated. * We mustn't write any other blocks before the superblock.
*/ for (j = 0;
j < bitmap->storage.file_pages
&& !test_bit(BITMAP_STALE, &bitmap->flags);
j++) { if (test_page_attr(bitmap, j,
BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ break; if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE))
filemap_write_page(bitmap, j, false);
}
done: if (bitmap->allclean == 0)
mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
mutex_unlock(&mddev->bitmap_info.mutex);
}
/* return a pointer to the 16-bit counter covering 'offset'; *blocks is set
 * to the number of sectors covered by that counter (or by the whole
 * unallocated page).  Returns NULL when out of range or when the page
 * could not be allocated. */
static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
					       sector_t offset, sector_t *blocks,
					       int create)
__releases(bitmap->lock)
__acquires(bitmap->lock)
{
	/* If 'create', we might release the lock and reclaim it.
	 * The lock must have been taken with interrupts enabled.
	 * If !create, we don't release the lock.
	 */
	sector_t chunk = offset >> bitmap->chunkshift;
	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
	unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
	sector_t csize = ((sector_t)1) << bitmap->chunkshift;
	int err;

	if (page >= bitmap->pages) {
		/*
		 * This can happen if bitmap_start_sync goes beyond
		 * End-of-device while looking for a whole page or
		 * user set a huge number to sysfs bitmap_set_bits.
		 */
		*blocks = csize - (offset & (csize - 1));
		return NULL;
	}
	err = md_bitmap_checkpage(bitmap, page, create, 0);

	/* this copy had dropped the *blocks/err handling on the normal
	 * path, leaving 'err' unchecked; restore it: an unallocated or
	 * hijacked page covers a whole page worth of chunks */
	if (bitmap->bp[page].hijacked ||
	    bitmap->bp[page].map == NULL)
		csize = ((sector_t)1) << (bitmap->chunkshift +
					  PAGE_COUNTER_SHIFT);

	*blocks = csize - (offset & (csize - 1));

	if (err < 0)
		return NULL;

	/* now locked ... */

	if (bitmap->bp[page].hijacked) { /* hijacked pointer */
		/* should we use the first or second counter field
		 * of the hijacked pointer? */
		int hi = (pageoff > PAGE_COUNTER_MASK);
		return &((bitmap_counter_t *)
			 &bitmap->bp[page].map)[hi];
	} else /* page is allocated */
		return (bitmap_counter_t *)
			&(bitmap->bp[page].map[pageoff]);
}
if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) {
DEFINE_WAIT(__wait); /* note that it is safe to do the prepare_to_wait * after the test as long as we do it before dropping * the spinlock.
*/
prepare_to_wait(&bitmap->overflow_wait, &__wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bitmap->counts.lock);
schedule();
finish_wait(&bitmap->overflow_wait, &__wait); continue;
}
switch (*bmc) { case 0:
md_bitmap_file_set_bit(bitmap, offset);
md_bitmap_count_page(&bitmap->counts, offset, 1);
fallthrough; case 1:
*bmc = 2;
}
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024; returntrue; /* always resync if no bitmap */
}
spin_lock_irq(&bitmap->counts.lock);
staticbool bitmap_start_sync(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded)
{ /* bitmap_start_sync must always report on multiples of whole * pages, otherwise resync (which is very PAGE_SIZE based) will * get confused. * So call __bitmap_start_sync repeatedly (if needed) until * At least PAGE_SIZE>>9 blocks are covered. * Return the 'or' of the result.
*/ bool rv = false;
sector_t blocks1;
staticvoid bitmap_close_sync(struct mddev *mddev)
{ /* Sync has finished, and any bitmap chunks that weren't synced * properly have been aborted. It remains to us to clear the * RESYNC bit wherever it is still on
*/
sector_t sector = 0;
sector_t blocks; struct bitmap *bitmap = mddev->bitmap;
for (sector = old_lo; sector < new_lo; ) {
__bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
for (sector = old_hi; sector < new_hi; ) {
bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
}
staticvoid md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{ /* For each chunk covered by any of these sectors, set the * counter to 2 and possibly set resync_needed. They should all * be 0 at this point
*/
md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec); if (sec < bitmap->mddev->resync_offset) /* We are asserting that the array is dirty, * so move the resync_offset address back so * that it is obvious that it is dirty
*/
bitmap->mddev->resync_offset = sec;
}
}
/* run the daemon_work three time to ensure everything is flushed * that can be
*/
sleep = mddev->bitmap_info.daemon_sleep * 2;
bitmap->daemon_lastrun -= sleep;
bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
bitmap_daemon_work(mddev); if (mddev->bitmap_info.external)
md_super_wait(mddev);
bitmap_update_sb(bitmap);
}
/* wait for behind writes to complete */ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
mdname(mddev)); /* need to kick something here to make sure I/O goes? */
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
}
bitmap_wait_behind_writes(mddev); if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
spin_unlock(&mddev->lock);
mutex_unlock(&mddev->bitmap_info.mutex);
mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true);
md_bitmap_free(bitmap);
}
/* * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up * once mddev->bitmap is set
*/ staticstruct bitmap *__bitmap_create(struct mddev *mddev, int slot)
{ struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors; struct file *file = mddev->bitmap_info.file; int err; struct kernfs_node *bm = NULL;
BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
BUG_ON(file && mddev->bitmap_info.offset);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
mdname(mddev)); return ERR_PTR(-EBUSY);
}
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM);
if (mddev->kobj.sd)
bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); if (bm) {
bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
sysfs_put(bm);
} else
bitmap->sysfs_can_clear = NULL;
bitmap->storage.file = file; if (file) {
get_file(file); /* As future accesses to this file will use bmap, * and bypass the page cache, we must sync the file * first.
*/
vfs_fsync(file, 1);
} /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ if (!mddev->bitmap_info.external) { /* * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is * instructing us to create a new on-disk bitmap instance.
*/ if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags))
err = md_bitmap_new_disk_sb(bitmap); else
err = md_bitmap_read_sb(bitmap);
} else {
err = 0; if (mddev->bitmap_info.chunksize == 0 ||
mddev->bitmap_info.daemon_sleep == 0) /* chunksize and time_base need to be
* set first. */
err = -EINVAL;
} if (err) goto error;
if (mddev_is_clustered(mddev))
mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
/* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, * so we should forget old cached info. * All chunks should be clean, but some might need_sync.
*/ while (sector < mddev->resync_max_sectors) {
sector_t blocks;
bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
bitmap_close_sync(mddev);
if (mddev->degraded == 0
|| bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a
* re-add of a missing device */
start = mddev->resync_offset;
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
err = -EIO;
out: return err;
}
/* caller need to free returned bitmap with md_bitmap_free() */ staticvoid *bitmap_get_from_slot(struct mddev *mddev, int slot)
{ int rv = 0; struct bitmap *bitmap;
/*
 * Loads the bitmap associated with slot and copies the resync information
 * to our bitmap
 */
static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
				 sector_t *high, bool clear_bits)
{
	int rv = 0, i, j;
	sector_t block, lo = 0, hi = 0;
	struct bitmap_counts *counts;
	struct bitmap *bitmap;

	bitmap = bitmap_get_from_slot(mddev, slot);
	if (IS_ERR(bitmap)) {
		pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
		return -1;
	}

	counts = &bitmap->counts;
	for (j = 0; j < counts->chunks; j++) {
		block = (sector_t)j << counts->chunkshift;
		if (md_bitmap_file_test_bit(bitmap, block)) {
			/* track the range of dirty chunks [lo, hi] */
			if (!lo)
				lo = block;
			hi = block;
			md_bitmap_file_clear_bit(bitmap, block);
			md_bitmap_set_memory_bits(mddev->bitmap, block, 1);
			md_bitmap_file_set_bit(mddev->bitmap, block);
		}
	}

	if (clear_bits) {
		bitmap_update_sb(bitmap);
		/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
		 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
		for (i = 0; i < bitmap->storage.file_pages; i++)
			if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
				set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
		__bitmap_unplug(bitmap);
	}
	__bitmap_unplug(mddev->bitmap);
	*low = lo;
	*high = hi;
	md_bitmap_free(bitmap);

	/* restore the return that was truncated from this copy */
	return rv;
}
/*
 * NOTE(review): the trailing text below was German website boilerplate that
 * leaked into this file during extraction and is not part of the source.
 * Translation: "The information on this website was carefully compiled to
 * the best of our knowledge.  However, neither completeness, correctness,
 * nor quality of the provided information is guaranteed.  Note: the colored
 * syntax rendering and the measurement are still experimental."
 * It should be removed from the file entirely.
 */