DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write");
/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
/* BUG FIX: the #define directives were fused onto the comment-close
 * lines; preprocessor directives must start on their own line. */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/* * How do we handle breaking sharing of data blocks? * ================================================= * * We use a standard copy-on-write btree to store the mappings for the * devices (note I'm talking about copy-on-write of the metadata here, not * the data). When you take an internal snapshot you clone the root node * of the origin btree. After this there is no concept of an origin or a * snapshot. They are just two device trees that happen to point to the * same data blocks. * * When we get a write in we decide if it's to a shared data block using * some timestamp magic. If it is, we have to break sharing. * * Let's say we write to a shared block in what was the origin. The * steps are: * * i) plug io further to this physical block. (see bio_prison code). * * ii) quiesce any read io to that shared data block. Obviously * including all devices that share this block. (see dm_deferred_set code) * * iii) copy the data block to a newly allocate block. This step can be * missed out if the io covers the block. (schedule_copy). * * iv) insert the new mapping into the origin's btree * (process_prepared_mapping). This act of inserting breaks some * sharing of btree nodes between the two devices. Breaking sharing only * effects the btree of that specific device. Btrees for the other * devices that share the block never change. The btree for the origin * device as it was after the last commit is untouched, ie. we're using * persistent data structures in the functional programming sense. * * v) unplug io to this physical block, including the io that triggered * the breaking of sharing. * * Steps (ii) and (iii) occur in parallel. * * The metadata _doesn't_ need to be committed before the io continues. We * get away with this because the io is always written to a _new_ block. * If there's a crash, then: * * - The origin mapping will point to the old origin block (the shared * one). 
This will contain the data as it was before the io that triggered * the breaking of sharing came in. * * - The snap mapping still points to the old block. As it would after * the commit. * * The downside of this scheme is the timestamp magic isn't perfect, and * will continue to think that data block in the snapshot device is shared * even after the write to the origin has broken sharing. I suspect data * blocks will typically be shared by many different devices, so we're * breaking sharing n + 1 times, rather than n, where n is the number of * devices that reference this data block. At the moment I think the * benefits far, far outweigh the disadvantages.
*/
/* * A pool device ties together a metadata device and a data device. It * also provides the interface for creating and destroying internal * devices.
*/ struct dm_thin_new_mapping;
/* * The pool runs in various modes. Ordered in degraded order for comparisons.
*/ enum pool_mode {
PM_WRITE, /* metadata may be changed */
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
/* * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
*/
PM_OUT_OF_METADATA_SPACE,
PM_READ_ONLY, /* metadata may not be changed */
if (mode == PM_OUT_OF_DATA_SPACE) { if (!pool->pf.error_if_no_space)
extra_desc = " (queue IO)"; else
extra_desc = " (error IO)";
}
dm_table_event(pool->ti->table);
DMINFO("%s: switching pool to %s%s mode",
dm_device_name(pool->pool_md),
descs[(int)mode], extra_desc ? : "");
}
/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf;	/* Features requested during table load */
	struct pool_features adjusted_pf;	/* Features used after adjusting for constituent devices */
};
/* * Ensures the thin is not destroyed until the worker has finished * iterating the active_thins list.
*/
refcount_t refcount; struct completion can_destroy;
};
/*
 * Submit the final chained discard bio (if any) and complete the parent
 * bio once every chained sub-discard has finished.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void end_discard(struct discard_op *op, int r)
{
	if (op->bio) {
		/*
		 * Even if one of the calls to issue_discard failed, we
		 * need to wait for the chain to complete.
		 */
		bio_chain(op->bio, op->parent_bio);
		op->bio->bi_opf = REQ_OP_DISCARD;
		submit_bio(op->bio);
	}

	blk_finish_plug(&op->plug);

	/*
	 * Even if r is set, there could be sub discards in flight that we
	 * need to wait for.
	 */
	if (r && !op->parent_bio->bi_status)
		op->parent_bio->bi_status = errno_to_blk_status(r);
	bio_endio(op->parent_bio);
}
/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}
/*
 * Wrapper around dm_bio_detain() that preallocates a cell from the
 * prison's mempool first.  Returns the dm_bio_detain() result: non-zero
 * means an existing cell already held this key.
 * BUG FIXES: `staticint` was fused into one token, and the function
 * fell off the end without returning a value (undefined behavior for a
 * non-void function) — the trailing `return r;` was restored.
 */
static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r) {
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
	}

	return r;
}
/* * A global list of pools that uses a struct mapped_device as a key.
*/ staticstruct dm_thin_pool_table { struct mutex mutex; struct list_head pools;
} dm_thin_pool_table;
/*
 * Fail all retryable bios with the pool's current I/O error code.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void error_retry_list(struct pool *pool)
{
	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}
/* * This section of code contains the logic for processing a thin device's IO. * Much of the code depends on pool object resources (lists, workqueues, etc) * but most is exclusively called from the thin target rather than the thin-pool * target.
*/
/*
 * Map a bio's starting sector to the virtual thin block it falls in.
 */
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *p = tc->pool;
	sector_t blk = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(p))
		(void) sector_div(blk, p->sectors_per_block);
	else
		blk >>= p->sectors_per_block_shift;

	return blk;
}
/*
 * Returns the _complete_ blocks that this bio covers.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
				dm_block_t *begin, dm_block_t *end)
{
	struct pool *pool = tc->pool;
	sector_t b = bio->bi_iter.bi_sector;
	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);

	b += pool->sectors_per_block - 1ull;	/* so we round up */

	if (block_size_is_power_of_two(pool)) {
		b >>= pool->sectors_per_block_shift;
		e >>= pool->sectors_per_block_shift;
	} else {
		(void) sector_div(b, pool->sectors_per_block);
		(void) sector_div(e, pool->sectors_per_block);
	}

	if (e < b) {
		/* Can happen if the bio is within a single block. */
		e = b;
	}

	*begin = b;
	*end = e;
}
staticvoid remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{ struct pool *pool = tc->pool;
sector_t bi_sector = bio->bi_iter.bi_sector;
/*
 * Add this bio to the pool's all-IO deferred set so the worker can
 * quiesce against it.  Discards are tracked separately, so skip them.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio_op(bio) == REQ_OP_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}
/*
 * Send a remapped bio to the pool's data device, deferring it if it
 * would require a metadata commit first.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;

	if (!bio_triggers_commit(tc, bio)) {
		dm_submit_bio_remap(bio, NULL);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed e.g, due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irq(&pool->lock);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irq(&pool->lock);
}
/* * Track quiescing, copying and zeroing preparation actions. When this * counter hits zero the block is prepared and can be inserted into the * btree.
*/
atomic_t prepare_actions;
/* * If the bio covers the whole area of a block then we can avoid * zeroing or copying. Instead this bio is hooked. The bio will * still be in the cell, so care has to be taken to avoid issuing * the bio twice.
*/ struct bio *bio;
bio_end_io_t *saved_bi_end_io;
};
/*
 * Drop one outstanding preparation action; when the last one completes,
 * move the mapping onto the prepared list and wake the worker.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (atomic_dec_and_test(&m->prepare_actions)) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}
/* * This sends the bios in the cell, except the original holder, back * to the deferred_bios list.
*/ staticvoid cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{ struct pool *pool = tc->pool; unsignedlong flags; struct bio_list bios;
while ((bio = bio_list_pop(&cell->bios))) { if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
bio_list_add(&info->defer_bios, bio); else {
inc_all_io_entry(info->tc->pool, bio);
/* * We can't issue the bios with the bio prison lock * held, so we add them to a list to issue on * return from this function.
*/
bio_list_add(&info->issue_bios, bio);
}
}
}
/* * We have to be careful to inc any bios we're about to issue * before the cell is released, and avoid a race with new bios * being added to the cell.
*/
cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
&info, cell);
while ((bio = bio_list_pop(&info.defer_bios)))
thin_defer_bio(tc, bio);
while ((bio = bio_list_pop(&info.issue_bios)))
remap_and_issue(info.tc, bio, block);
}
/*
 * Complete an overwrite bio, deferring its completion if it requires a
 * metadata commit first.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;

	/*
	 * If the bio has the REQ_FUA flag set we must commit the metadata
	 * before signaling its completion.
	 */
	if (!bio_triggers_commit(tc, bio)) {
		bio_endio(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to the
	 * metadata that can't be committed, e.g, due to I/O errors on the
	 * metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irq(&pool->lock);
	bio_list_add(&pool->deferred_flush_completions, bio);
	spin_unlock_irq(&pool->lock);
}
staticvoid process_prepared_mapping(struct dm_thin_new_mapping *m)
{ struct thin_c *tc = m->tc; struct pool *pool = tc->pool; struct bio *bio = m->bio; int r;
if (m->status) {
cell_error(pool, m->cell); goto out;
}
/* * Commit the prepared block into the mapping btree. * Any I/O for this block arriving after this point will get * remapped to it directly.
*/
r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block); if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell); goto out;
}
/* * Release any bios held while the block was being provisioned. * If we are processing a write bio that completely covers the block, * we already processed it so can ignore it now when processing * the bios in the cell.
*/ if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
complete_overwrite_bio(tc, bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
}
staticvoid passdown_double_checking_shared_status(struct dm_thin_new_mapping *m, struct bio *discard_parent)
{ /* * We've already unmapped this range of blocks, but before we * passdown we have to check that these blocks are now unused.
*/ int r = 0; bool shared = true; struct thin_c *tc = m->tc; struct pool *pool = tc->pool;
dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; struct discard_op op;
begin_discard(&op, tc, discard_parent); while (b != end) { /* find start of unmapped run */ for (; b < end; b++) {
r = dm_pool_block_is_shared(pool->pmd, b, &shared); if (r) goto out;
if (!shared) break;
}
if (b == end) break;
/* find end of run */ for (e = b + 1; e != end; e++) {
r = dm_pool_block_is_shared(pool->pmd, e, &shared); if (r) goto out;
/*
 * Completion handler for a passdown discard: kick off part 2 of the
 * passdown regardless of the discard's status.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void passdown_endio(struct bio *bio)
{
	/*
	 * It doesn't matter if the passdown discard failed, we still want
	 * to unmap (we ignore err).
	 */
	queue_passdown_pt2(bio->bi_private);
	bio_put(bio);
}
staticvoid process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
{ int r; struct thin_c *tc = m->tc; struct pool *pool = tc->pool; struct bio *discard_parent;
dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
/* * Only this thread allocates blocks, so we can be sure that the * newly unmapped blocks will not be allocated before the end of * the function.
*/
r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); if (r) {
metadata_operation_failed(pool, "dm_thin_remove_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, &pool->mapping_pool); return;
}
/* * Increment the unmapped blocks. This prevents a race between the * passdown io and reallocation of freed blocks.
*/
r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); if (r) {
metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, &pool->mapping_pool); return;
}
staticvoid process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
{ int r; struct thin_c *tc = m->tc; struct pool *pool = tc->pool;
/* * The passdown has completed, so now we can decrement all those * unmapped blocks.
*/
r = dm_pool_dec_data_range(pool->pmd, m->data_block,
m->data_block + (m->virt_end - m->virt_begin)); if (r) {
metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
bio_io_error(m->bio);
} else
bio_endio(m->bio);
/* * quiesce action + copy action + an extra reference held for the * duration of this function (we may need to inc later for a * partial zero).
*/
atomic_set(&m->prepare_actions, 3);
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
complete_mapping_preparation(m); /* already quiesced */
/* * IO to pool_dev remaps to the pool target's data_dev. * * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first.
*/ if (io_overwrites_block(pool, bio))
remap_and_issue_overwrite(tc, bio, data_dest, m); else { struct dm_io_region from, to;
/* * Do we need to zero a tail region?
*/ if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
atomic_inc(&m->prepare_actions);
ll_zero(tc, m,
data_dest * pool->sectors_per_block + len,
(data_dest + 1) * pool->sectors_per_block);
}
}
complete_mapping_preparation(m); /* drop our ref */
}
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
m->virt_begin = virt_block;
m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
/* * If the whole block of data is being overwritten or we are not * zeroing pre-existing data, we can issue the bio immediately. * Otherwise we use kcopyd to zero the data first.
*/ if (pool->pf.zero_new_blocks) { if (io_overwrites_block(pool, bio))
remap_and_issue_overwrite(tc, bio, data_block, m); else {
ll_zero(tc, m, data_block * pool->sectors_per_block,
(data_block + 1) * pool->sectors_per_block);
}
} else
process_prepared_mapping(m);
}
staticbool is_read_only(struct pool *pool)
{ return is_read_only_pool_mode(get_pool_mode(pool));
}
staticvoid check_for_metadata_space(struct pool *pool)
{ int r; constchar *ooms_reason = NULL;
dm_block_t nr_free;
r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free); if (r)
ooms_reason = "Could not get free metadata blocks"; elseif (!nr_free)
ooms_reason = "No free metadata blocks";
/*
 * If the pool ran out of data space but free blocks have since appeared
 * (e.g. after a resize), switch back to write mode and requeue held bios.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void check_for_data_space(struct pool *pool)
{
	int r;
	dm_block_t nr_free;

	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
		return;

	/* Best effort: if we can't read the count, stay in the current mode. */
	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
	if (r)
		return;

	if (nr_free) {
		set_pool_mode(pool, PM_WRITE);
		requeue_bios(pool);
	}
}
/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 * BUG FIX: `staticint` was fused into one token (compile error).
 */
static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
	else {
		/* A successful commit may have freed metadata/data space. */
		check_for_metadata_space(pool);
		check_for_data_space(pool);
	}

	return r;
}
/*
 * Raise a one-shot uevent when free data blocks drop to or below the
 * configured low-water mark.
 * BUG FIX: `staticvoid` was fused into one token (compile error).
 */
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark for data device: sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irq(&pool->lock);
		pool->low_water_triggered = true;
		spin_unlock_irq(&pool->lock);
		dm_table_event(pool->ti->table);
	}
}
/*
 * Allocate a free data block for @tc, storing it in *@result.
 * Returns 0 on success, -ENOSPC when the pool is genuinely out of data
 * space (the pool is switched to PM_OUT_OF_DATA_SPACE), or another
 * negative errno on metadata failure.
 * BUG FIX: `staticint` was fused into one token (compile error).
 */
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	struct pool *pool = tc->pool;

	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
		return -EINVAL;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
		return r;
	}

	check_low_water_mark(pool, free_blocks);

	if (!free_blocks) {
		/*
		 * Try to commit to see if that will free up some
		 * more space.
		 */
		r = commit(pool);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
		if (r) {
			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
			return r;
		}

		if (!free_blocks) {
			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
			return -ENOSPC;
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r) {
		if (r == -ENOSPC)
			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
		else
			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
		return r;
	}

	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
	if (r) {
		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
		return r;
	}

	if (!free_blocks) {
		/* Let's commit before we use up the metadata reserve. */
		r = commit(pool);
		if (r)
			return r;
	}

	return 0;
}
/* * If we have run out of space, queue bios until the device is * resumed, presumably after having been reloaded with more space.
*/ staticvoid retry_on_resume(struct bio *bio)
{ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc;
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
{ enum pool_mode m = get_pool_mode(pool);
switch (m) { case PM_WRITE: /* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); return BLK_STS_IOERR;
case PM_OUT_OF_DATA_SPACE: return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
case PM_OUT_OF_METADATA_SPACE: case PM_READ_ONLY: case PM_FAIL: return BLK_STS_IOERR; default: /* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); return BLK_STS_IOERR;
}
}
staticvoid handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
blk_status_t error = should_error_unserviceable_bio(pool);
/* * We don't need to lock the data blocks, since there's no * passdown. We only lock data blocks for allocation and breaking sharing.
*/
m->tc = tc;
m->virt_begin = virt_cell->key.block_begin;
m->virt_end = virt_cell->key.block_end;
m->cell = virt_cell;
m->bio = virt_cell->holder;
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
pool->process_prepared_discard(m);
}
staticvoid break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, struct bio *bio)
{ struct pool *pool = tc->pool;
while (begin != end) {
r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
&data_begin, &maybe_shared); if (r) { /* * Silently fail, letting any mappings we've * created complete.
*/ break;
}
data_end = data_begin + (virt_end - virt_begin);
/* * Make sure the data region obeys the bio prison restrictions.
*/ while (data_begin < data_end) {
r = ensure_next_mapping(pool); if (r) return; /* we did our best */
/* This key is certainly within range given the above splitting */
(void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { /* contention, we'll give up with this range */
data_begin += len; continue;
}
/* * IO may still be going to the destination block. We must * quiesce before we can do the removal.
*/
m = get_next_mapping(pool);
m->tc = tc;
m->maybe_shared = maybe_shared;
m->virt_begin = virt_begin;
m->virt_end = virt_begin + len;
m->data_block = data_begin;
m->cell = data_cell;
m->bio = bio;
/* * The parent bio must not complete before sub discard bios are * chained to it (see end_discard's bio_chain)! * * This per-mapping bi_remaining increment is paired with * the implicit decrement that occurs via bio_endio() in * end_discard().
*/
bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
pool->process_prepared_discard(m);
/* * The virt_cell will only get freed once the origin bio completes. * This means it will remain locked while all the individual * passdown bios are in flight.
*/
h->cell = virt_cell;
break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
/* * We complete the bio now, knowing that the bi_remaining field * will prevent completion until the sub range discards have * completed.
*/
bio_endio(bio);
}
get_bio_block_range(tc, bio, &begin, &end); if (begin == end) { /* * The discard covers less than a block.
*/
bio_endio(bio); return;
}
if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
DMERR_LIMIT("Discard doesn't respect bio prison limits");
bio_endio(bio); return;
}
if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) { /* * Potential starvation issue: We're relying on the * fs/application being well behaved, and not trying to * send IO to a region at the same time as discarding it. * If they do this persistently then it's possible this * cell will never be granted.
*/ return;
}
tc->pool->process_discard_cell(tc, virt_cell);
}
staticvoid break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, struct dm_bio_prison_cell *cell)
{ int r;
dm_block_t data_block; struct pool *pool = tc->pool;
r = alloc_data_block(tc, &data_block); switch (r) { case 0:
schedule_internal_copy(tc, block, lookup_result->block,
data_block, cell, bio); break;
case -ENOSPC:
retry_bios_on_resume(pool, cell); break;
/* * If cell is already occupied, then sharing is already in the process * of being broken so we have nothing further to do here.
*/
build_data_key(tc->td, lookup_result->block, &key); if (bio_detain(pool, &key, bio, &data_cell)) {
cell_defer_no_holder(tc, virt_cell); return;
}
staticvoid provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_bio_prison_cell *cell)
{ int r;
dm_block_t data_block; struct pool *pool = tc->pool;
/* * Remap empty bios (flushes) immediately, without provisioning.
*/ if (!bio->bi_iter.bi_size) {
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_and_issue(tc, bio, 0); return;
}
/* * Fill read bios with zeroes and complete them immediately.
*/ if (bio_data_dir(bio) == READ) {
zero_fill_bio(bio);
cell_defer_no_holder(tc, cell);
bio_endio(bio); return;
}
r = alloc_data_block(tc, &data_block); switch (r) { case 0: if (tc->origin_dev)
schedule_external_copy(tc, block, data_block, cell, bio); else
schedule_zero(tc, block, data_block, cell, bio); break;
case -ENOSPC:
retry_bios_on_resume(pool, cell); break;
/* * If cell is already occupied, then the block is already * being provisioned so we have nothing further to do here.
*/
build_virtual_key(tc->td, block, &key); if (bio_detain(pool, &key, bio, &cell)) return;
process_cell(tc, cell);
}
staticvoid __process_bio_read_only(struct thin_c *tc, struct bio *bio, struct dm_bio_prison_cell *cell)
{ int r; int rw = bio_data_dir(bio);
dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result;
r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
handle_unserviceable_bio(tc->pool, bio); if (cell)
cell_defer_no_holder(tc, cell);
} else {
inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block); if (cell)
inc_remap_and_issue_cell(tc, cell, lookup_result.block);
} break;
case -ENODATA: if (cell)
cell_defer_no_holder(tc, cell); if (rw != READ) {
handle_unserviceable_bio(tc->pool, bio); break;
}
if (tc->origin_dev) {
inc_all_io_entry(tc->pool, bio);
remap_to_origin_and_issue(tc, bio); break;
}
/*
 * FIXME: should we also commit due to size of transaction, measured in
 * metadata blocks?
 * BUG FIX: `staticint` was fused into one token (compile error).
 */
static int need_commit_due_to_time(struct pool *pool)
{
	/* Non-zero once COMMIT_PERIOD jiffies have elapsed since the last commit. */
	return !time_in_range(jiffies, pool->last_commit_jiffies,
			      pool->last_commit_jiffies + COMMIT_PERIOD);
}
blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { /* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some * prepared mappings to process.
*/ if (ensure_next_mapping(pool)) {
spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
bio_list_merge(&tc->deferred_bio_list, &bios);
spin_unlock_irq(&tc->lock); break;
}
if (bio_op(bio) == REQ_OP_DISCARD)
pool->process_discard(tc, bio); else
pool->process_bio(tc, bio);
for (i = 0; i < count; i++) {
cell = pool->cell_sort_array[i];
BUG_ON(!cell->holder);
/* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some * prepared mappings to process.
*/ if (ensure_next_mapping(pool)) { for (j = i; j < count; j++)
list_add(&pool->cell_sort_array[j]->user_list, &cells);
/* * We can't hold rcu_read_lock() around code that can block. So we * find a thin with the rcu lock held; bump a refcount; then drop * the lock.
*/ staticstruct thin_c *get_first_thin(struct pool *pool)
{ struct thin_c *tc = NULL;
/* * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion.
*/
bio_list_init(&bios);
bio_list_init(&bio_completions);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) return;
if (commit(pool)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio); return;
}
pool->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
while ((bio = bio_list_pop(&bios))) { /* * The data device was flushed as part of metadata commit, * so complete redundant flushes immediately.
*/ if (bio->bi_opf & REQ_PREFLUSH)
bio_endio(bio); else
dm_submit_bio_remap(bio, NULL);
}
}
staticvoid do_worker(struct work_struct *ws)
{ struct pool *pool = container_of(ws, struct pool, worker);
/* * We want to commit periodically so that not too much * unwritten data builds up.
*/ staticvoid do_waker(struct work_struct *ws)
{ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
/* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/ staticvoid do_no_space_timeout(struct work_struct *ws)
{ struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
/* * Never allow the pool to transition to PM_WRITE mode if user * intervention is required to verify metadata and data consistency.
*/ if (new_mode == PM_WRITE && needs_check) {
DMERR("%s: unable to switch pool to write mode until repaired.",
dm_device_name(pool->pool_md)); if (old_mode != new_mode)
new_mode = old_mode; else
new_mode = PM_READ_ONLY;
} /* * If we were in PM_FAIL mode, rollback of metadata failed. We're * not going to recover without a thin_repair. So we never let the * pool move out of the old mode.
*/ if (old_mode == PM_FAIL)
new_mode = old_mode;
/*
 * NOTE(review): the following German website disclaimer is extraneous
 * scraped text, not part of this driver.  It is preserved here as a
 * comment so the file stays compilable, but it should be removed.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */