/* * A metadata commit and the actions taken in case it fails should run * as a single atomic step.
*/ struct mutex commit_lock;
struct dm_clone_metadata *cmd;
/* Region hydration hash table */ struct hash_table_bucket *ht;
atomic_t ios_in_flight;
wait_queue_head_t hydration_stopped;
mempool_t hydration_pool;
unsignedlong last_commit_jiffies;
/* * We defer incoming WRITE bios for regions that are not hydrated, * until after these regions have been hydrated. * * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the * metadata have been committed.
*/
spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_discard_bios; struct bio_list deferred_flush_bios; struct bio_list deferred_flush_completions;
/* Maximum number of regions being copied during background hydration. */ unsignedint hydration_threshold;
/* Number of regions to batch together during background hydration. */ unsignedint hydration_batch_size;
/* Which region to hydrate next */ unsignedlong hydration_offset;
atomic_t hydrations_in_flight;
/* * Save a copy of the table line rather than reconstructing it for the * status.
*/ unsignedint nr_ctr_args; constchar **ctr_args;
/* * dm_clone_reload_in_core_bitset() may run concurrently with either * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but * it's safe as we have already set the metadata to read-only mode.
*/
__reload_in_core_bitset(clone);
}
/* Wake up anyone waiting for region hydrations to stop */
static inline void wakeup_hydration_waiters(struct clone *clone)
{
	wake_up_all(&clone->hydration_stopped);
}
/* Get the address of the region in sectors */ staticinline sector_t region_to_sector(struct clone *clone, unsignedlong region_nr)
{ return ((sector_t)region_nr << clone->region_shift);
}
/* Get the region number of the bio */
static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
{
	return (bio->bi_iter.bi_sector >> clone->region_shift);
}
/* Get the region range covered by the bio */ staticvoid bio_region_range(struct clone *clone, struct bio *bio, unsignedlong *rs, unsignedlong *nr_regions)
{ unsignedlong end;
*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
end = bio_end_sector(bio) >> clone->region_shift;
if (*rs >= end)
*nr_regions = 0; else
*nr_regions = end - *rs;
}
/* Check whether a bio overwrites a region */ staticinlinebool is_overwrite_bio(struct clone *clone, struct bio *bio)
{ return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
}
/* Complete every bio in @bios with the error status @status. */
static void fail_bios(struct bio_list *bios, blk_status_t status)
{
	struct bio *bio;

	/*
	 * The original body submitted the bios and referenced an undeclared
	 * plug; failing a list of bios means completing each one with the
	 * requested error status.
	 */
	while ((bio = bio_list_pop(bios))) {
		bio->bi_status = status;
		bio_endio(bio);
	}
}
/*
 * Submit bio to the underlying device.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void issue_bio(struct clone *clone, struct bio *bio)
{
	if (!bio_triggers_commit(clone, bio)) {
		submit_bio_noacct(bio);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL we won't be able to commit the
	 * metadata, so we complete the bio with an error.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_flush_bios().
	 */
	spin_lock_irq(&clone->lock);
	bio_list_add(&clone->deferred_flush_bios, bio);
	spin_unlock_irq(&clone->lock);

	wake_worker(clone);
}
/*
 * Remap bio to the destination device and submit it.
 *
 * If the bio triggers a commit, delay it, until after the metadata have been
 * committed.
 */
static void remap_and_issue(struct clone *clone, struct bio *bio)
{
	remap_to_dest(clone, bio);
	issue_bio(clone, bio);
}
/* * Issue bios that have been deferred until after their region has finished * hydrating. * * We delegate the bio submission to the worker thread, so this is safe to call * from interrupt context.
*/ staticvoid issue_deferred_bios(struct clone *clone, struct bio_list *bios)
{ struct bio *bio; unsignedlong flags; struct bio_list flush_bios = BIO_EMPTY_LIST; struct bio_list normal_bios = BIO_EMPTY_LIST;
if (bio_list_empty(bios)) return;
while ((bio = bio_list_pop(bios))) { if (bio_triggers_commit(clone, bio))
bio_list_add(&flush_bios, bio); else
bio_list_add(&normal_bios, bio);
}
staticvoid complete_overwrite_bio(struct clone *clone, struct bio *bio)
{ unsignedlong flags;
/* * If the bio has the REQ_FUA flag set we must commit the metadata * before signaling its completion. * * complete_overwrite_bio() is only called by hydration_complete(), * after having successfully updated the metadata. This means we don't * need to call dm_clone_changed_this_transaction() to check if the * metadata has changed and thus we can avoid taking the metadata spin * lock.
*/ if (!(bio->bi_opf & REQ_FUA)) {
bio_endio(bio); return;
}
/* * If the metadata mode is RO or FAIL we won't be able to commit the * metadata, so we complete the bio with an error.
*/ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
bio_io_error(bio); return;
}
/* * Batch together any bios that trigger commits and then issue a single * commit for them in process_deferred_flush_bios().
*/
spin_lock_irqsave(&clone->lock, flags);
bio_list_add(&clone->deferred_flush_completions, bio);
spin_unlock_irqrestore(&clone->lock, flags);
/* * If the destination device supports discards, remap and trim the * discard bio and pass it down. Otherwise complete the bio * immediately.
*/ if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
remap_to_dest(clone, bio);
bio_region_range(clone, bio, &rs, &nr_regions);
trim_bio(bio, region_to_sector(clone, rs),
nr_regions << clone->region_shift);
submit_bio_noacct(bio);
} else
bio_endio(bio);
}
bio_region_range(clone, bio, &rs, &nr_regions); if (!nr_regions) {
bio_endio(bio); return;
}
if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
(rs + nr_regions) > clone->nr_regions)) {
DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
clone_device_name(clone), rs, nr_regions,
clone->nr_regions,
(unsignedlonglong)bio->bi_iter.bi_sector,
bio_sectors(bio));
bio_endio(bio); return;
}
/* * The covered regions are already hydrated so we just need to pass * down the discard.
*/ if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
complete_discard_bio(clone, bio, true); return;
}
/* * If the metadata mode is RO or FAIL we won't be able to update the * metadata for the regions covered by the discard so we just ignore * it.
*/ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
bio_endio(bio); return;
}
struct bio *overwrite_bio;
bio_end_io_t *overwrite_bio_end_io;
struct bio_list deferred_bios;
blk_status_t status;
/* Used by hydration batching */ struct list_head list;
/* Used by hydration hash table */ struct hlist_node h;
};
/*
 * Hydration hash table implementation.
 *
 * Ideally we would like to use list_bl, which uses bit spin locks and employs
 * the least significant bit of the list head to lock the corresponding bucket,
 * reducing the memory overhead for the locks. But, currently, list_bl and bit
 * spin locks don't support IRQ safe versions. Since we have to take the lock
 * in both process and interrupt context, we must fall back to using regular
 * spin locks; one per hash table bucket.
 */
struct hash_table_bucket {
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};
/* * Search hash table for a hydration with hd->region_nr == region_nr * * NOTE: Must be called with the bucket lock held
*/ staticstruct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, unsignedlong region_nr)
{ struct dm_clone_region_hydration *hd;
/*
 * Insert a hydration into the hash table.
 *
 * NOTE: Must be called with the bucket lock held.
 */
static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
					     struct dm_clone_region_hydration *hd)
{
	hlist_add_head(&hd->h, &bucket->head);
}
/* * This function inserts a hydration into the hash table, unless someone else * managed to insert a hydration for the same region first. In the latter case * it returns the existing hydration descriptor for this region. * * NOTE: Must be called with the hydration hash table lock held.
*/ staticstruct dm_clone_region_hydration *
__find_or_insert_region_hydration(struct hash_table_bucket *bucket, struct dm_clone_region_hydration *hd)
{ struct dm_clone_region_hydration *hd2;
hd2 = __hash_find(bucket, hd->region_nr); if (hd2) return hd2;
/* * Allocate a hydration from the hydration mempool. * This might block but it can't fail.
*/
hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
hd->clone = clone;
/* * Update dm-clone's metadata after a region has finished hydrating and remove * hydration from the hash table.
*/ staticint hydration_update_metadata(struct dm_clone_region_hydration *hd)
{ int r = 0; unsignedlong flags; struct hash_table_bucket *bucket; struct clone *clone = hd->clone;
if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
r = -EPERM;
/* Update the metadata */ if (likely(!r) && hd->status == BLK_STS_OK)
r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
/* Continue background hydration, if there is no I/O in-flight */ if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
!atomic_read(&clone->ios_in_flight))
wake_worker(clone);
}
if (region_end == clone->nr_regions - 1) { /* * The last region of the target might be smaller than * region_size.
*/
tail_size = clone->ti->len & (region_size - 1); if (!tail_size)
tail_size = region_size;
} else {
tail_size = region_size;
}
staticvoid hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
{ /* * We don't need to save and restore bio->bi_private because device * mapper core generates a new bio for us to use, with clean * bi_private.
*/
hd->overwrite_bio = bio;
hd->overwrite_bio_end_io = bio->bi_end_io;
/*
 * Hydrate bio's region.
 *
 * This function starts the hydration of the bio's region and puts the bio in
 * the list of deferred bios for this region. In case, by the time this
 * function is called, the region has finished hydrating it's submitted to the
 * destination device.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irq(bucket);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irq(bucket);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 */
	bucket_unlock_irq(bucket);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irq(bucket);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		hlist_del(&hd->h);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irq(bucket);
		hydration_overwrite(hd, bio);
	} else {
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		hydration_copy(hd, 1);
	}
}
/* * Batch region hydrations. * * To better utilize device bandwidth we batch together the hydration of * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which * is good for small, random write performance (because of the overwriting of * un-hydrated regions) and at the same time issue big copy requests to kcopyd * to achieve high hydration bandwidth.
*/ struct batch_info { struct dm_clone_region_hydration *head; unsignedint nr_batched_regions;
};
if (batch->head) { /* Try to extend the current batch */ if (batch->nr_batched_regions < max_batch_size &&
(batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
list_add_tail(&hd->list, &batch->head->list);
batch->nr_batched_regions++;
hd = NULL;
}
/* Check if we should issue the current batch */ if (batch->nr_batched_regions >= max_batch_size || hd) {
hydration_copy(batch->head, batch->nr_batched_regions);
batch->head = NULL;
batch->nr_batched_regions = 0;
}
}
if (!hd) return;
/* We treat max batch sizes of zero and one equivalently */ if (max_batch_size <= 1) {
hydration_copy(hd, 1); return;
}
/* Start a new batch */
BUG_ON(!list_empty(&hd->list));
batch->head = hd;
batch->nr_batched_regions = 1;
}
/* * This function searches for regions that still reside in the source device * and starts their hydration.
*/ staticvoid do_hydration(struct clone *clone)
{ unsignedint current_volume; unsignedlong offset, nr_regions = clone->nr_regions;
if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) return;
if (dm_clone_is_hydration_done(clone->cmd)) return;
/* * Avoid race with device suspension.
*/
atomic_inc(&clone->hydrations_in_flight);
/* * Make sure atomic_inc() is ordered before test_bit(), otherwise we * might race with clone_postsuspend() and start a region hydration * after the target has been suspended. * * This is paired with the smp_mb__after_atomic() in * clone_postsuspend().
*/
smp_mb__after_atomic();
if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) goto out;
/* Update the metadata */
bio_list_for_each(bio, &discards) {
bio_region_range(clone, bio, &rs, &nr_regions); /* * A discard request might cover regions that have been already * hydrated. There is no need to update the metadata for these * regions.
*/
r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions); if (unlikely(r)) break;
}
out:
blk_start_plug(&plug); while ((bio = bio_list_pop(&discards)))
complete_discard_bio(clone, bio, r == 0);
blk_finish_plug(&plug);
}
/* * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion.
*/
spin_lock_irq(&clone->lock);
bio_list_merge_init(&bios, &clone->deferred_flush_bios);
bio_list_merge_init(&bio_completions,
&clone->deferred_flush_completions);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) return;
if (commit_metadata(clone, &dest_dev_flushed)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
}
clone->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
while ((bio = bio_list_pop(&bios))) { if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) { /* We just flushed the destination device as part of * the metadata commit, so there is no reason to send * another flush.
*/
bio_endio(bio);
} else {
submit_bio_noacct(bio);
}
}
}
/* * Commit periodically so that not too much unwritten data builds up. * * Also, restart background hydration, if it has been stopped by in-flight I/O.
*/ staticvoid do_waker(struct work_struct *work)
{ struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
/* * dm-clone interprets discards and performs a fast hydration of the * discarded regions, i.e., we skip the copy from the source device and * just mark the regions as hydrated.
*/ if (bio_op(bio) == REQ_OP_DISCARD) {
process_discard_bio(clone, bio); return DM_MAPIO_SUBMITTED;
}
/* * If the bio's region is hydrated, redirect it to the destination * device. * * If the region is not hydrated and the bio is a READ, redirect it to * the source device. * * Else, defer WRITE bio until after its region has been hydrated and * start the region's hydration immediately.
*/
region_nr = bio_to_region(clone, bio); if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
remap_and_issue(clone, bio); return DM_MAPIO_SUBMITTED;
} elseif (bio_data_dir(bio) == READ) {
remap_to_source(clone, bio); return DM_MAPIO_REMAPPED;
}
/* * Construct a clone device mapping: * * clone <metadata dev> <destination dev> <source dev> <region size> * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] * * metadata dev: Fast device holding the persistent metadata * destination dev: The destination device, which will become a clone of the * source device * source dev: The read-only source device that gets cloned * region size: dm-clone unit size in sectors * * #feature args: Number of feature arguments passed * feature args: E.g. no_hydration, no_discard_passdown * * #core arguments: An even number of core arguments * core arguments: Key/value pairs for tuning the core * E.g. 'hydration_threshold 256'
*/ staticint parse_feature_args(struct dm_arg_set *as, struct clone *clone)
{ int r; unsignedint argc; constchar *arg_name; struct dm_target *ti = clone->ti;
conststruct dm_arg args = {
.min = 0,
.max = 2,
.error = "Invalid number of feature arguments"
};
/* No feature arguments supplied */ if (!as->argc) return 0;
r = dm_read_arg_group(&args, as, &argc, &ti->error); if (r) return r;
while (argc) {
arg_name = dm_shift_arg(as);
argc--;
r = dm_read_arg(&arg, as, ®ion_size, error); if (r) return r;
/* Check region size is a power of 2 */ if (!is_power_of_2(region_size)) {
*error = "Region size is not a power of 2"; return -EINVAL;
}
/* Validate the region size against the device logical block size */ if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
*error = "Region size is not a multiple of device logical block size"; return -EINVAL;
}
clone->region_size = region_size;
return 0;
}
/*
 * Validate the total number of regions.
 *
 * Returns 0 when @n is supported, -EINVAL otherwise (with *error set).
 */
static int validate_nr_regions(unsigned long n, char **error)
{
	/*
	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
	 * further to 2^31 regions.
	 */
	if (n > (1UL << 31)) {
		*error = "Too many regions. Consider increasing the region size";
		return -EINVAL;
	}

	return 0;
}
r = dm_get_device(clone->ti, dm_shift_arg(as),
BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->metadata_dev); if (r) {
*error = "Error opening metadata device"; return r;
}
metadata_dev_size = get_dev_size(clone->metadata_dev); if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
clone->metadata_dev->bdev, DM_CLONE_METADATA_MAX_SECTORS);
/* Check for overflow */ if (nr_regions != (unsignedlong)nr_regions) {
ti->error = "Too many regions. Consider increasing the region size";
r = -EOVERFLOW; goto out_with_source_dev;
}
clone->nr_regions = nr_regions;
r = validate_nr_regions(clone->nr_regions, &ti->error); if (r) goto out_with_source_dev;
r = dm_set_target_max_io_len(ti, clone->region_size); if (r) {
ti->error = "Failed to set max io len"; goto out_with_source_dev;
}
r = parse_feature_args(&as, clone); if (r) goto out_with_source_dev;
r = parse_core_args(&as, clone); if (r) goto out_with_source_dev;
/* Load metadata */
clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
clone->region_size); if (IS_ERR(clone->cmd)) {
ti->error = "Failed to load metadata";
r = PTR_ERR(clone->cmd); goto out_with_source_dev;
}
__set_clone_mode(clone, CM_WRITE);
if (get_clone_mode(clone) != CM_WRITE) {
ti->error = "Unable to get write access to metadata, please check/repair metadata";
r = -EPERM; goto out_with_metadata;
}
clone->last_commit_jiffies = jiffies;
/* Allocate hydration hash table */
r = hash_table_init(clone); if (r) {
ti->error = "Failed to allocate hydration hash table"; goto out_with_metadata;
}
/* * To successfully suspend the device: * * - We cancel the delayed work for periodic commits and wait for * it to finish. * * - We stop the background hydration, i.e. we prevent new region * hydrations from starting. * * - We wait for any in-flight hydrations to finish. * * - We flush the workqueue. * * - We commit the metadata.
*/
cancel_delayed_work_sync(&clone->waker);
/* * Make sure set_bit() is ordered before atomic_read(), otherwise we * might race with do_hydration() and miss some started region * hydrations. * * This is paired with smp_mb__after_atomic() in do_hydration().
*/
smp_mb__after_atomic();
if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { /* No passdown is done so we set our own virtual limits */
limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
limits->max_hw_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT,
clone->region_size); return;
}
/* * clone_iterate_devices() is stacking both the source and destination * device limits but discards aren't passed to the source device, so * inherit destination's limits.
*/
limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
limits->discard_granularity = dest_limits->discard_granularity;
limits->discard_alignment = dest_limits->discard_alignment;
limits->max_discard_segments = dest_limits->max_discard_segments;
}
/* * If the system-determined stacked limits are compatible with * dm-clone's region size (io_opt is a factor) do not override them.
*/ if (io_opt_sectors < clone->region_size ||
do_div(io_opt_sectors, clone->region_size)) {
limits->io_min = clone->region_size << SECTOR_SHIFT;
limits->io_opt = clone->region_size << SECTOR_SHIFT;
}
/* * If user space sets hydration_threshold to zero then the hydration * will stop. If at a later time the hydration_threshold is increased * we must restart the hydration process by waking up the worker.
*/
wake_worker(clone);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.