/*
 * Declares the snapshot_copy_throttle object through the DM kcopyd helper
 * macro; the string is the description handed to that macro.
 */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.  Both limits are expressed in sectors, hence
 * the shift by SECTOR_SHIFT.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits, so this is the largest valid id.
 */
#define MAX_DEV_ID ((1 << 24) - 1)
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.  I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
/* * A pool device ties together a metadata device and a data device. It * also provides the interface for creating and destroying internal * devices.
*/ struct dm_thin_new_mapping;
/* * The pool runs in various modes. Ordered in degraded order for comparisons.
*/ enum pool_mode {
PM_WRITE, /* metadata may be changed */
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
/* * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
*/
PM_OUT_OF_METADATA_SPACE,
PM_READ_ONLY, /* metadata may not be changed */
if (mode == PM_OUT_OF_DATA_SPACE) { if (!pool->pf.error_if_no_space)
extra_desc = " (queue IO)"; else
extra_desc = " (error IO)";
}
dm_table_event(pool->ti->table);
DMINFO("%s: switching pool to %s%s mode",
dm_device_name(pool->pool_md),
descs[(int)mode], extra_desc ? : "");
}
/*
 * Target context for a pool.
 *
 * Couples a dm target with the pool object it drives and with the two
 * underlying devices (data and metadata) named by the fields below.
 */
struct pool_c {
	struct dm_target *ti;		/* the dm target this context belongs to */
	struct pool *pool;		/* the pool object used by this target */
	struct dm_dev *data_dev;	/* device holding pool data */
	struct dm_dev *metadata_dev;	/* device holding pool metadata */

	/* NOTE(review): presumably the data-device low water mark, in blocks - confirm */
	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};
/* * Ensures the thin is not destroyed until the worker has finished * iterating the active_thins list.
*/
refcount_t refcount; struct completion can_destroy;
};
staticvoid end_discard(struct discard_op *op, int r)
{ if (op->bio) { /* * Even if one of the calls to issue_discard failed, we * need to wait for the chain to complete.
*/
bio_chain(op->bio, op->parent_bio);
op->bio->bi_opf = REQ_OP_DISCARD;
submit_bio(op->bio);
}
blk_finish_plug(&op->plug);
/* * Even if r is set, there could be sub discards in flight that we * need to wait for.
*/ if (r && !op->parent_bio->bi_status)
op->parent_bio->bi_status = errno_to_blk_status(r);
bio_endio(op->parent_bio);
}
/*
 * Kick the pool's worker.
 *
 * Called whenever new work has been queued, and also when pool_resume is
 * ready to continue processing deferred IO.  The worker item is queued
 * on the pool's own workqueue.
 */
static void wake_worker(struct pool *pool)
{
	struct work_struct *work = &pool->worker;

	queue_work(pool->wq, work);
}
staticint bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio, struct dm_bio_prison_cell **cell_result)
{ int r; struct dm_bio_prison_cell *cell_prealloc;
/* * Allocate a cell from the prison's mempool. * This might block but it can't fail.
*/
cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result); if (r) { /* * We reused an old cell; we can get rid of * the new one.
*/
dm_bio_prison_free_cell(pool->prison, cell_prealloc);
}
/* * A global list of pools that uses a struct mapped_device as a key.
*/ staticstruct dm_thin_pool_table { struct mutex mutex; struct list_head pools;
} dm_thin_pool_table;
staticvoid error_retry_list(struct pool *pool)
{
error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}
/* * This section of code contains the logic for processing a thin device's IO. * Much of the code depends on pool object resources (lists, workqueues, etc) * but most is exclusively called from the thin target rather than the thin-pool * target.
*/
/*
 * Translate the bio's starting sector into a pool block number.
 *
 * @tc:  thin device the bio is aimed at (supplies the pool geometry)
 * @bio: the bio to examine
 *
 * Returns the starting sector divided by the pool's block size in
 * sectors.
 */
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	sector_t sector = bio->bi_iter.bi_sector;
	struct pool *p = tc->pool;

	/* Power-of-two block sizes only need a shift. */
	if (block_size_is_power_of_two(p))
		return sector >> p->sectors_per_block_shift;

	/* sector_div() updates 'sector' in place; its result is not needed. */
	(void) sector_div(sector, p->sectors_per_block);

	return sector;
}
/*
 * Work out which _complete_ blocks the bio covers.
 *
 * @tc:    thin device the bio is aimed at (supplies the pool geometry)
 * @bio:   the bio to examine
 * @begin: receives the first block of the range
 * @end:   receives the end block of the range
 *
 * The start is rounded up and the end rounded down to block boundaries.
 * When that leaves nothing, *begin == *end.
 */
static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
				dm_block_t *begin, dm_block_t *end)
{
	struct pool *p = tc->pool;
	sector_t first = bio->bi_iter.bi_sector;
	sector_t last = first + (bio->bi_iter.bi_size >> SECTOR_SHIFT);

	/* Bias the start so that the division below rounds it up. */
	first += p->sectors_per_block - 1ull;

	if (!block_size_is_power_of_two(p)) {
		(void) sector_div(first, p->sectors_per_block);
		(void) sector_div(last, p->sectors_per_block);
	} else {
		first >>= p->sectors_per_block_shift;
		last >>= p->sectors_per_block_shift;
	}

	/* A bio that sits inside a single block can give last < first. */
	if (last < first)
		last = first;

	*begin = first;
	*end = last;
}
staticvoid remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{ struct pool *pool = tc->pool;
sector_t bi_sector = bio->bi_iter.bi_sector;
/*
 * Take an entry in the pool's all_io deferred set for this bio and store
 * it in the bio's per-bio endio hook.
 *
 * Discard bios are left untouched.
 */
static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	if (bio_op(bio) != REQ_OP_DISCARD) {
		struct dm_thin_endio_hook *hook;

		hook = dm_per_bio_data(bio, sizeof(*hook));
		hook->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
	}
}
/*
 * Send a remapped bio on its way.
 *
 * A bio that does not trigger a commit is submitted straight away.  A
 * bio that does trigger one is either failed (when the metadata has
 * aborted changes) or parked on the pool's deferred_flush_bios list.
 */
static void issue(struct thin_c *tc, struct bio *bio)
{
	if (!bio_triggers_commit(tc, bio)) {
		dm_submit_bio_remap(bio, NULL);
	} else if (dm_thin_aborted_changes(tc->td)) {
		/*
		 * Earlier I/O changed the metadata in a way that can't be
		 * committed (e.g. I/O errors on the metadata device), so
		 * this bio has to complete with an error.
		 */
		bio_io_error(bio);
	} else {
		struct pool *p = tc->pool;

		/*
		 * Collect every bio that triggers a commit so that
		 * process_deferred_bios() can issue a single commit for
		 * the whole batch.
		 */
		spin_lock_irq(&p->lock);
		bio_list_add(&p->deferred_flush_bios, bio);
		spin_unlock_irq(&p->lock);
	}
}
/* * Track quiescing, copying and zeroing preparation actions. When this * counter hits zero the block is prepared and can be inserted into the * btree.
*/
atomic_t prepare_actions;
/* * If the bio covers the whole area of a block then we can avoid * zeroing or copying. Instead this bio is hooked. The bio will * still be in the cell, so care has to be taken to avoid issuing * the bio twice.
*/ struct bio *bio;
bio_end_io_t *saved_bi_end_io;
};
/*
 * Drop one preparation action from the mapping.
 *
 * Once the prepare_actions counter reaches zero the mapping is appended
 * to the pool's prepared_mappings list and the worker is woken.
 *
 * NOTE(review): the list is modified without taking a lock here, so the
 * caller presumably provides the locking - confirm.
 */
static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *p = m->tc->pool;

	/* Other preparation actions are still outstanding. */
	if (!atomic_dec_and_test(&m->prepare_actions))
		return;

	list_add_tail(&m->list, &p->prepared_mappings);
	wake_worker(p);
}
/* * This sends the bios in the cell, except the original holder, back * to the deferred_bios list.
*/ staticvoid cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{ struct pool *pool = tc->pool; unsignedlong flags; struct bio_list bios;
while ((bio = bio_list_pop(&cell->bios))) { if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
bio_list_add(&info->defer_bios, bio); else {
inc_all_io_entry(info->tc->pool, bio);
/* * We can't issue the bios with the bio prison lock * held, so we add them to a list to issue on * return from this function.
*/
bio_list_add(&info->issue_bios, bio);
}
}
}
/* * We have to be careful to inc any bios we're about to issue * before the cell is released, and avoid a race with new bios * being added to the cell.
*/
cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
&info, cell);
while ((bio = bio_list_pop(&info.defer_bios)))
thin_defer_bio(tc, bio);
while ((bio = bio_list_pop(&info.issue_bios)))
remap_and_issue(info.tc, bio, block);
}
/*
 * Complete a bio that overwrote a block.
 *
 * A bio that does not trigger a commit is ended immediately.  One that
 * does (per the original note: a bio with REQ_FUA set must have the
 * metadata committed before its completion is signalled) is either
 * failed, when the metadata has aborted changes, or parked on the
 * pool's deferred_flush_completions list.
 */
static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
	if (!bio_triggers_commit(tc, bio)) {
		bio_endio(bio);
	} else if (dm_thin_aborted_changes(tc->td)) {
		/*
		 * Earlier I/O changed the metadata in a way that can't be
		 * committed, e.g. due to I/O errors on the metadata device,
		 * so this bio has to complete with an error.
		 */
		bio_io_error(bio);
	} else {
		struct pool *p = tc->pool;

		/*
		 * Collect every bio that triggers a commit so that
		 * process_deferred_bios() can issue a single commit for
		 * the whole batch.
		 */
		spin_lock_irq(&p->lock);
		bio_list_add(&p->deferred_flush_completions, bio);
		spin_unlock_irq(&p->lock);
	}
}
staticvoid process_prepared_mapping(struct dm_thin_new_mapping *m)
{ struct thin_c *tc = m->tc; struct pool *pool = tc->pool; struct bio *bio = m->bio; int r;
if (m->status) {
cell_error(pool, m->cell); goto out;
}
/* * Commit the prepared block into the mapping btree. * Any I/O for this block arriving after this point will get * remapped to it directly.
*/
r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block); if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell); goto out;
}
/* * Release any bios held while the block was being provisioned. * If we are processing a write bio that completely covers the block, * we already processed it so can ignore it now when processing * the bios in the cell.
*/ if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
complete_overwrite_bio(tc, bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
}
staticvoid passdown_double_checking_shared_status(struct dm_thin_new_mapping *m, struct bio *discard_parent)
{ /* * We've already unmapped this range of blocks, but before we * passdown we have to check that these blocks are now unused.
*/ int r = 0; bool shared = true; struct thin_c *tc = m->tc; struct pool *pool = tc->pool;
dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; struct discard_op op;
begin_discard(&op, tc, discard_parent); while (b != end) { /* find start of unmapped run */ for (; b < end; b++) {
r = dm_pool_block_is_shared(pool->pmd, b, &shared); if (r) goto out;
if (!shared) break;
}
if (b == end) break;
/* find end of run */ for (e = b + 1; e != end; e++) {
r = dm_pool_block_is_shared(pool->pmd, e, &shared); if (r) goto out;
/*
 * End-io handler for a passdown discard bio.
 *
 * The bio's status is deliberately not inspected: whether or not the
 * passdown discard failed we still want to unmap, so the bio's private
 * pointer is always handed to queue_passdown_pt2().  The bio reference
 * is dropped afterwards.
 */
static void passdown_endio(struct bio *bio)
{
	void *context = bio->bi_private;

	queue_passdown_pt2(context);
	bio_put(bio);
}
staticvoid process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
{ int r; struct thin_c *tc = m->tc; struct pool *pool = tc->pool; struct bio *discard_parent;
dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
/* * Only this thread allocates blocks, so we can be sure that the * newly unmapped blocks will not be allocated before the end of * the function.
*/
r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); if (r) {
metadata_operation_failed(pool, "dm_thin_remove_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, &pool->mapping_pool); return;
}
/* * Increment the unmapped blocks. This prevents a race between the * passdown io and reallocation of freed blocks.
*/
r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); if (r) {
metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
bio_io_error(m->bio);
cell_defer_no_holder(tc, m->cell);
mempool_free(m, &pool->mapping_pool); return;
}
staticvoid process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
{ int r; struct thin_c *tc = m->tc; struct pool *pool = tc->pool;
/* * The passdown has completed, so now we can decrement all those * unmapped blocks.
*/
r = dm_pool_dec_data_range(pool->pmd, m->data_block,
m->data_block + (m->virt_end - m->virt_begin)); if (r) {
metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
bio_io_error(m->bio);
} else
bio_endio(m->bio);
/* * quiesce action + copy action + an extra reference held for the * duration of this function (we may need to inc later for a * partial zero).
*/
atomic_set(&m->prepare_actions, 3);
if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
complete_mapping_preparation(m); /* already quiesced */
/* * IO to pool_dev remaps to the pool target's data_dev. * * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first.
*/ if (io_overwrites_block(pool, bio))
remap_and_issue_overwrite(tc, bio, data_dest, m); else { struct dm_io_region from, to;
/* * Do we need to zero a tail region?
*/ if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
atomic_inc(&m->prepare_actions);
ll_zero(tc, m,
data_dest * pool->sectors_per_block + len,
(data_dest + 1) * pool->sectors_per_block);
}
}
complete_mapping_preparation(m); /* drop our ref */
}
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
m->virt_begin = virt_block;
m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
/* * If the whole block of data is being overwritten or we are not * zeroing pre-existing data, we can issue the bio immediately. * Otherwise we use kcopyd to zero the data first.
*/ if (pool->pf.zero_new_blocks) { if (io_overwrites_block(pool, bio))
remap_and_issue_overwrite(tc, bio, data_block, m); else {
ll_zero(tc, m, data_block * pool->sectors_per_block,
(data_block + 1) * pool->sectors_per_block);
}
} else
process_prepared_mapping(m);
}
staticbool is_read_only(struct pool *pool)
{ return is_read_only_pool_mode(get_pool_mode(pool));
}
staticvoid check_for_metadata_space(struct pool *pool)
{ int r; constchar *ooms_reason = NULL;
dm_block_t nr_free;
r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free); if (r)
ooms_reason = "Could not get free metadata blocks"; elseif (!nr_free)
ooms_reason = "No free metadata blocks";
/*
 * Leave PM_OUT_OF_DATA_SPACE mode once data blocks are free again.
 *
 * Nothing happens unless the pool is currently in PM_OUT_OF_DATA_SPACE
 * mode, the free block count can be read, and that count is non-zero.
 * In that case the pool is switched to PM_WRITE and requeue_bios() is
 * called.
 */
static void check_for_data_space(struct pool *pool)
{
	dm_block_t free_blocks;

	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
		return;

	/* A failed lookup is silently ignored here. */
	if (dm_pool_get_free_block_count(pool->pmd, &free_blocks))
		return;

	if (!free_blocks)
		return;

	set_pool_mode(pool, PM_WRITE);
	requeue_bios(pool);
}
/*
 * Commit the pool metadata.
 *
 * Returns 0 on success.  A non-zero return indicates read_only or
 * fail_io mode; many callers don't care about the return value.
 * -EINVAL is returned without attempting a commit when the pool mode is
 * PM_OUT_OF_METADATA_SPACE or a more degraded one.
 */
static int commit(struct pool *pool)
{
	int err;

	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
		return -EINVAL;

	err = dm_pool_commit_metadata(pool->pmd);
	if (err) {
		metadata_operation_failed(pool, "dm_pool_commit_metadata", err);
		return err;
	}

	/* The commit went through: re-evaluate both space conditions. */
	check_for_metadata_space(pool);
	check_for_data_space(pool);

	return 0;
}
/*
 * Raise the low water mark event for the data device, once.
 *
 * @pool:        the pool being checked
 * @free_blocks: current number of free data blocks
 *
 * When @free_blocks is at or below the pool's low_water_blocks and the
 * event has not been triggered yet, a warning is logged, the
 * low_water_triggered flag is set under the pool lock and a table event
 * is sent.
 */
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
	if (free_blocks > pool->low_water_blocks || pool->low_water_triggered)
		return;

	DMWARN("%s: reached low water mark for data device: sending event.",
	       dm_device_name(pool->pool_md));

	spin_lock_irq(&pool->lock);
	pool->low_water_triggered = true;
	spin_unlock_irq(&pool->lock);

	dm_table_event(pool->ti->table);
}
/*
 * Allocate a data block from the pool for a thin device.
 *
 * @tc:     the thin device requesting the block
 * @result: receives the newly allocated data block
 *
 * Returns 0 on success.  -EINVAL is returned (with a WARN) when the
 * pool is not in PM_WRITE mode.  -ENOSPC is returned, and the pool is
 * switched to PM_OUT_OF_DATA_SPACE, when no data block is free even
 * after a commit.  Any other error from the metadata layer is reported
 * through metadata_operation_failed() and passed back to the caller.
 */
static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	struct pool *p = tc->pool;
	dm_block_t nr_free;
	int err;

	if (WARN_ON(get_pool_mode(p) != PM_WRITE))
		return -EINVAL;

	err = dm_pool_get_free_block_count(p->pmd, &nr_free);
	if (err) {
		metadata_operation_failed(p, "dm_pool_get_free_block_count", err);
		return err;
	}

	check_low_water_mark(p, nr_free);

	if (nr_free == 0) {
		/*
		 * A commit may release some more space, so try one
		 * before giving up.
		 */
		err = commit(p);
		if (err)
			return err;

		err = dm_pool_get_free_block_count(p->pmd, &nr_free);
		if (err) {
			metadata_operation_failed(p, "dm_pool_get_free_block_count", err);
			return err;
		}

		if (nr_free == 0) {
			set_pool_mode(p, PM_OUT_OF_DATA_SPACE);
			return -ENOSPC;
		}
	}

	err = dm_pool_alloc_data_block(p->pmd, result);
	if (err == -ENOSPC) {
		set_pool_mode(p, PM_OUT_OF_DATA_SPACE);
		return err;
	}
	if (err) {
		metadata_operation_failed(p, "dm_pool_alloc_data_block", err);
		return err;
	}

	/* From here on nr_free counts free metadata blocks. */
	err = dm_pool_get_free_metadata_block_count(p->pmd, &nr_free);
	if (err) {
		metadata_operation_failed(p, "dm_pool_get_free_metadata_block_count", err);
		return err;
	}

	if (nr_free)
		return 0;

	/* Let's commit before we use up the metadata reserve. */
	return commit(p);
}
/* * If we have run out of space, queue bios until the device is * resumed, presumably after having been reloaded with more space.
*/ staticvoid retry_on_resume(struct bio *bio)
{ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc;
/*
 * Decide which status an unserviceable bio should be completed with.
 *
 * Returns 0 when the pool is out of data space and is not configured to
 * error in that case, BLK_STS_NOSPC when it is out of data space and
 * error_if_no_space is set, and BLK_STS_IOERR for every other mode.
 */
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
{
	blk_status_t status = BLK_STS_IOERR;

	switch (get_pool_mode(pool)) {
	case PM_OUT_OF_DATA_SPACE:
		if (pool->pf.error_if_no_space)
			status = BLK_STS_NOSPC;
		else
			status = 0;
		break;

	case PM_OUT_OF_METADATA_SPACE:
	case PM_READ_ONLY:
	case PM_FAIL:
		break;

	case PM_WRITE:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
		break;

	default:
		/* Shouldn't get here */
		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
		break;
	}

	return status;
}
staticvoid handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
blk_status_t error = should_error_unserviceable_bio(pool);
/* * We don't need to lock the data blocks, since there's no * passdown. We only lock data blocks for allocation and breaking sharing.
*/
m->tc = tc;
m->virt_begin = virt_cell->key.block_begin;
m->virt_end = virt_cell->key.block_end;
m->cell = virt_cell;
m->bio = virt_cell->holder;
if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
pool->process_prepared_discard(m);
}
staticvoid break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, struct bio *bio)
{ struct pool *pool = tc->pool;
while (begin != end) {
r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
&data_begin, &maybe_shared); if (r) { /* * Silently fail, letting any mappings we've * created complete.
*/ break;
}
data_end = data_begin + (virt_end - virt_begin);
/* * Make sure the data region obeys the bio prison restrictions.
*/ while (data_begin < data_end) {
r = ensure_next_mapping(pool); if (r) return; /* we did our best */
/* This key is certainly within range given the above splitting */
(void) build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { /* contention, we'll give up with this range */
data_begin += len; continue;
}
/* * IO may still be going to the destination block. We must * quiesce before we can do the removal.
*/
m = get_next_mapping(pool);
m->tc = tc;
m->maybe_shared = maybe_shared;
m->virt_begin = virt_begin;
m->virt_end = virt_begin + len;
m->data_block = data_begin;
m->cell = data_cell;
m->bio = bio;
/* * The parent bio must not complete before sub discard bios are * chained to it (see end_discard's bio_chain)! * * This per-mapping bi_remaining increment is paired with * the implicit decrement that occurs via bio_endio() in * end_discard().
*/
bio_inc_remaining(bio); if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
pool->process_prepared_discard(m);
/* * The virt_cell will only get freed once the origin bio completes. * This means it will remain locked while all the individual * passdown bios are in flight.
*/
h->cell = virt_cell;
break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
/* * We complete the bio now, knowing that the bi_remaining field * will prevent completion until the sub range discards have * completed.
*/
bio_endio(bio);
}
get_bio_block_range(tc, bio, &begin, &end); if (begin == end) { /* * The discard covers less than a block.
*/
bio_endio(bio); return;
}
if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
DMERR_LIMIT("Discard doesn't respect bio prison limits");
bio_endio(bio); return;
}
if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) { /* * Potential starvation issue: We're relying on the * fs/application being well behaved, and not trying to * send IO to a region at the same time as discarding it. * If they do this persistently then it's possible this * cell will never be granted.
*/ return;
}
tc->pool->process_discard_cell(tc, virt_cell);
}
staticvoid break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, struct dm_bio_prison_cell *cell)
{ int r;
dm_block_t data_block; struct pool *pool = tc->pool;
r = alloc_data_block(tc, &data_block); switch (r) { case 0:
schedule_internal_copy(tc, block, lookup_result->block,
data_block, cell, bio); break;
case -ENOSPC:
retry_bios_on_resume(pool, cell); break;
/* * If cell is already occupied, then sharing is already in the process * of being broken so we have nothing further to do here.
*/
build_data_key(tc->td, lookup_result->block, &key); if (bio_detain(pool, &key, bio, &data_cell)) {
cell_defer_no_holder(tc, virt_cell); return;
}
staticvoid provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_bio_prison_cell *cell)
{ int r;
dm_block_t data_block; struct pool *pool = tc->pool;
/* * Remap empty bios (flushes) immediately, without provisioning.
*/ if (!bio->bi_iter.bi_size) {
inc_all_io_entry(pool, bio);
cell_defer_no_holder(tc, cell);
remap_and_issue(tc, bio, 0); return;
}
/* * Fill read bios with zeroes and complete them immediately.
*/ if (bio_data_dir(bio) == READ) {
zero_fill_bio(bio);
cell_defer_no_holder(tc, cell);
bio_endio(bio); return;
}
r = alloc_data_block(tc, &data_block); switch (r) { case 0: if (tc->origin_dev)
schedule_external_copy(tc, block, data_block, cell, bio); else
schedule_zero(tc, block, data_block, cell, bio); break;
case -ENOSPC:
retry_bios_on_resume(pool, cell); break;
/* * If cell is already occupied, then the block is already * being provisioned so we have nothing further to do here.
*/
build_virtual_key(tc->td, block, &key); if (bio_detain(pool, &key, bio, &cell)) return;
process_cell(tc, cell);
}
staticvoid __process_bio_read_only(struct thin_c *tc, struct bio *bio, struct dm_bio_prison_cell *cell)
{ int r; int rw = bio_data_dir(bio);
dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result;
r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
handle_unserviceable_bio(tc->pool, bio); if (cell)
cell_defer_no_holder(tc, cell);
} else {
inc_all_io_entry(tc->pool, bio);
remap_and_issue(tc, bio, lookup_result.block); if (cell)
inc_remap_and_issue_cell(tc, cell, lookup_result.block);
} break;
case -ENODATA: if (cell)
cell_defer_no_holder(tc, cell); if (rw != READ) {
handle_unserviceable_bio(tc->pool, bio); break;
}
if (tc->origin_dev) {
inc_all_io_entry(tc->pool, bio);
remap_to_origin_and_issue(tc, bio); break;
}
/* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks?
*/ staticint need_commit_due_to_time(struct pool *pool)
{ return !time_in_range(jiffies, pool->last_commit_jiffies,
pool->last_commit_jiffies + COMMIT_PERIOD);
}
blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { /* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some * prepared mappings to process.
*/ if (ensure_next_mapping(pool)) {
spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
bio_list_merge(&tc->deferred_bio_list, &bios);
spin_unlock_irq(&tc->lock); break;
}
if (bio_op(bio) == REQ_OP_DISCARD)
pool->process_discard(tc, bio); else
pool->process_bio(tc, bio);
for (i = 0; i < count; i++) {
cell = pool->cell_sort_array[i];
BUG_ON(!cell->holder);
/* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some * prepared mappings to process.
*/ if (ensure_next_mapping(pool)) { for (j = i; j < count; j++)
list_add(&pool->cell_sort_array[j]->user_list, &cells);
/* * We can't hold rcu_read_lock() around code that can block. So we * find a thin with the rcu lock held; bump a refcount; then drop * the lock.
*/ staticstruct thin_c *get_first_thin(struct pool *pool)
{ struct thin_c *tc = NULL;
/* * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion.
*/
bio_list_init(&bios);
bio_list_init(&bio_completions);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) return;
if (commit(pool)) {
bio_list_merge(&bios, &bio_completions);
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio); return;
}
pool->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bio_completions)))
bio_endio(bio);
while ((bio = bio_list_pop(&bios))) { /* * The data device was flushed as part of metadata commit, * so complete redundant flushes immediately.
*/ if (bio->bi_opf & REQ_PREFLUSH)
bio_endio(bio); else
dm_submit_bio_remap(bio, NULL);
}
}
staticvoid do_worker(struct work_struct *ws)
{ struct pool *pool = container_of(ws, struct pool, worker);
/* * We want to commit periodically so that not too much * unwritten data builds up.
*/ staticvoid do_waker(struct work_struct *ws)
{ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
/* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/ staticvoid do_no_space_timeout(struct work_struct *ws)
{ struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
/* * Never allow the pool to transition to PM_WRITE mode if user * intervention is required to verify metadata and data consistency.
*/ if (new_mode == PM_WRITE && needs_check) {
DMERR("%s: unable to switch pool to write mode until repaired.",
dm_device_name(pool->pool_md)); if (old_mode != new_mode)
new_mode = old_mode; else
new_mode = PM_READ_ONLY;
} /* * If we were in PM_FAIL mode, rollback of metadata failed. We're * not going to recover without a thin_repair. So we never let the * pool move out of the old mode.
*/ if (old_mode == PM_FAIL)
new_mode = old_mode;
case PM_OUT_OF_DATA_SPACE: /* * Ideally we'd never hit this state; the low water mark * would trigger userland to extend the pool before we * completely run out of data space. However, many small * IOs to unprovisioned space can consume data space at an * alarming rate. Adjust your low water mark if you're * frequently seeing this mode.
*/
pool->out_of_data_space = true;
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
pool->process_prepared_mapping = process_prepared_mapping;
set_discard_callbacks(pool);
if (!pool->pf.error_if_no_space && no_space_timeout)
queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); break;
pool->pf.mode = new_mode; /* * The pool mode may have changed, sync it so bind_control_target() * doesn't cause an unexpected mode transition on resume.
*/
pt->adjusted_pf.mode = new_mode;
if (old_mode != new_mode)
notify_of_pool_mode_change(pool);
}
/*
 * Abort the current metadata transaction and flag the metadata as
 * needing a check.
 *
 * Both steps are always attempted.  If either one fails, an error is
 * logged and the pool is switched to PM_FAIL.
 */
static void abort_transaction(struct pool *pool)
{
	const char *name = dm_device_name(pool->pool_md);
	int r;

	DMERR_LIMIT("%s: aborting current metadata transaction", name);

	r = dm_pool_abort_metadata(pool->pmd);
	if (r) {
		DMERR("%s: failed to abort metadata transaction", name);
		set_pool_mode(pool, PM_FAIL);
	}

	r = dm_pool_metadata_set_needs_check(pool->pmd);
	if (r) {
		DMERR("%s: failed to set 'needs_check' flag in metadata", name);
		set_pool_mode(pool, PM_FAIL);
	}
}
staticvoid metadata_operation_failed(struct pool *pool, constchar *op, int r)
{
DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
dm_device_name(pool->pool_md), op, r);
/* * Called only while mapping a thin bio to hand it over to the workqueue.
*/ staticvoid thin_defer_bio(struct thin_c *tc, struct bio *bio)
{ struct pool *pool = tc->pool;
/* * We must hold the virtual cell before doing the lookup, otherwise * there's a race with discard.
*/
build_virtual_key(tc->td, block, &key); if (bio_detain(tc->pool, &key, bio, &virt_cell)) return DM_MAPIO_SUBMITTED;
r = dm_thin_find_block(td, block, 0, &result);
/* * Note that we defer readahead too.
*/ switch (r) { case 0: if (unlikely(result.shared)) { /* * We have a race condition here between the * result.shared value returned by the lookup and * snapshot creation, which may cause new * sharing. * * To avoid this always quiesce the origin before * taking the snap. You want to do this anyway to * ensure a consistent application view * (i.e. lockfs). * * More distant ancestors are irrelevant. The * shared flag will be set in their case.
*/
thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED;
}
case -ENODATA: case -EWOULDBLOCK:
thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED;
default: /* * Must always call bio_io_error on failure. * dm_thin_find_block can fail with -EINVAL if the * pool is switched to fail-io mode.
*/
bio_io_error(bio);
cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED;
}
}
staticvoid requeue_bios(struct pool *pool)
{ struct thin_c *tc;
/* *-------------------------------------------------------------- * Binding of control targets to a pool object *--------------------------------------------------------------
*/ staticbool is_factor(sector_t block_size, uint32_t n)
{ return !sector_div(block_size, n);
}
/* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not.
*/ staticvoid disable_discard_passdown_if_not_supported(struct pool_c *pt)
{ struct pool *pool = pt->pool; struct block_device *data_bdev = pt->data_dev->bdev; struct queue_limits *data_limits = bdev_limits(data_bdev); constchar *reason = NULL;
if (!pt->adjusted_pf.discard_passdown) return;
if (!bdev_max_discard_sectors(pt->data_dev->bdev))
reason = "discard unsupported";
elseif (data_limits->max_discard_sectors < pool->sectors_per_block)
reason = "max discard sectors smaller than a block";
/* * We want to make sure that a pool in PM_FAIL mode is never upgraded.
*/ enum pool_mode old_mode = get_pool_mode(pool); enum pool_mode new_mode = pt->adjusted_pf.mode;
/* * Don't change the pool's mode until set_pool_mode() below. * Otherwise the pool's process_* function pointers may * not match the desired pool mode.
*/
pt->adjusted_pf.mode = old_mode;
/* * Create singlethreaded workqueue that will service all devices * that use this metadata.
*/
pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); if (!pool->wq) {
*error = "Error creating pool's workqueue";
err_p = ERR_PTR(-ENOMEM); goto bad_wq;
}
/*
 * Take one more reference on @pool.
 *
 * dm_thin_pool_table.mutex must be locked when this is called; the
 * BUG_ON() below enforces that.
 */
static void __pool_inc(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	pool->ref_count++;
}
/*
 * Drop one reference on @pool; when the count reaches zero the pool is
 * handed to __pool_destroy().
 *
 * dm_thin_pool_table.mutex must be locked and the reference count must be
 * non-zero on entry; both conditions are enforced with BUG_ON().
 */
static void __pool_dec(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	BUG_ON(!pool->ref_count);

	if (!--pool->ref_count)
		__pool_destroy(pool);
}
staticstruct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, struct block_device *data_dev, unsignedlong block_size, int read_only, char **error, int *created)
{ struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
if (pool) { if (pool->pool_md != pool_md) {
*error = "metadata device already in use by a pool"; return ERR_PTR(-EBUSY);
} if (pool->data_dev != data_dev) {
*error = "data device already in use by a pool"; return ERR_PTR(-EBUSY);
}
__pool_inc(pool);
} else {
pool = __pool_table_lookup(pool_md); if (pool) { if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool"; return ERR_PTR(-EINVAL);
}
__pool_inc(pool);
else {
ti->error = "Unrecognised pool feature requested";
r = -EINVAL; break;
}
}
return r;
}
/*
 * Low-water-mark callback for the metadata device.
 *
 * @context is the struct pool the callback was registered for.  A warning
 * naming the pool's mapped device is logged and a dm table event is
 * raised on the table of the pool's currently bound target.
 */
static void metadata_low_callback(void *context)
{
	struct pool *pool = context;

	DMWARN("%s: reached low water mark for metadata device: sending event.",
	       dm_device_name(pool->pool_md));

	dm_table_event(pool->ti->table);
}
/* * We need to flush the data device **before** committing the metadata. * * This ensures that the data blocks of any newly inserted mappings are * properly written to non-volatile storage and won't be lost in case of a * crash. * * Failure to do so can result in data corruption in the case of internal or * external snapshots and in the case of newly provisioned blocks, when block * zeroing is enabled.
*/ staticint metadata_pre_commit_callback(void *context)
{ struct pool *pool = context;
if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
bdev, THIN_METADATA_MAX_SECTORS);
}
/* * When a metadata threshold is crossed a dm event is triggered, and * userland should respond by growing the metadata device. We could let * userland set the threshold, like we do with the data threshold, but I'm * not sure they know enough to do this well.
*/ static dm_block_t calc_metadata_threshold(struct pool_c *pt)
{ /* * 4M is ample for all ops with the possible exception of thin * device deletion which is harmless if it fails (just retry the * delete after you've grown the device).
*/
dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
/* * thin-pool <metadata dev> <data dev> * <data block size (sectors)> * <low water mark (blocks)> * [<#feature args> [<arg>]*] * * Optional feature arguments are: * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. * ignore_discard: disable discard * no_discard_passdown: don't pass discards down to the data device * read_only: Don't allow any changes to be made to the pool metadata. * error_if_no_space: error IOs, instead of queueing, if no space.
*/ staticint pool_ctr(struct dm_target *ti, unsignedint argc, char **argv)
{ int r, pool_created = 0; struct pool_c *pt; struct pool *pool; struct pool_features pf; struct dm_arg_set as; struct dm_dev *data_dev; unsignedlong block_size;
dm_block_t low_water_blocks; struct dm_dev *metadata_dev;
blk_mode_t metadata_mode;
/* * FIXME Remove validation from scope of lock.
*/
mutex_lock(&dm_thin_pool_table.mutex);
if (argc < 4) {
ti->error = "Invalid argument count";
r = -EINVAL; goto out_unlock;
}
as.argc = argc;
as.argv = argv;
/* make sure metadata and data are different devices */ if (!strcmp(argv[0], argv[1])) {
ti->error = "Error setting metadata or data device";
r = -EINVAL; goto out_unlock;
}
/* * Set default pool features.
*/
pool_features_init(&pf);
dm_consume_args(&as, 4);
r = parse_pool_features(&as, &pf, ti); if (r) goto out_unlock;
if (kstrtoull(argv[3], 10, (unsignedlonglong *)&low_water_blocks)) {
ti->error = "Invalid low water mark";
r = -EINVAL; goto out;
}
pt = kzalloc(sizeof(*pt), GFP_KERNEL); if (!pt) {
r = -ENOMEM; goto out;
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); if (IS_ERR(pool)) {
r = PTR_ERR(pool); goto out_free_pt;
}
/* * 'pool_created' reflects whether this is the first table load. * Top level discard support is not allowed to be changed after * initial load. This would require a pool reload to trigger thin * device changes.
*/ if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
ti->error = "Discard support cannot be disabled once enabled";
r = -EINVAL; goto out_flags_changed;
}
/* * Only need to enable discards if the pool should pass * them down to the data device. The thin device's discard * processing will cause mappings to be removed from the btree.
*/ if (pf.discard_enabled && pf.discard_passdown) {
ti->num_discard_bios = 1; /* * Setting 'discards_supported' circumvents the normal * stacking of discard limits (this keeps the pool and * thin devices' discard limits consistent).
*/
ti->discards_supported = true;
ti->max_discard_granularity = true;
}
ti->private = pt;
r = dm_pool_register_metadata_threshold(pt->pool->pmd,
calc_metadata_threshold(pt),
metadata_low_callback,
pool); if (r) {
ti->error = "Error registering metadata threshold"; goto out_flags_changed;
}
staticint pool_map(struct dm_target *ti, struct bio *bio)
{ struct pool_c *pt = ti->private; struct pool *pool = pt->pool;
/* * As this is a singleton target, ti->begin is always zero.
*/
spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
spin_unlock_irq(&pool->lock);
r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); if (r) {
DMERR("%s: failed to retrieve data device size",
dm_device_name(pool->pool_md)); return r;
}
if (data_size < sb_data_size) {
DMERR("%s: pool target (%llu blocks) too small: expected %llu",
dm_device_name(pool->pool_md),
(unsignedlonglong)data_size, sb_data_size); return -EINVAL;
} elseif (data_size > sb_data_size) { if (dm_pool_metadata_needs_check(pool->pmd)) {
DMERR("%s: unable to grow the data device until repaired.",
dm_device_name(pool->pool_md)); return 0;
}
if (sb_data_size)
DMINFO("%s: growing the data device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
sb_data_size, (unsignedlonglong)data_size);
r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) {
metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); return r;
}
r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); if (r) {
DMERR("%s: failed to retrieve metadata device size",
dm_device_name(pool->pool_md)); return r;
}
if (metadata_dev_size < sb_metadata_dev_size) {
DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
dm_device_name(pool->pool_md),
metadata_dev_size, sb_metadata_dev_size); return -EINVAL;
} elseif (metadata_dev_size > sb_metadata_dev_size) { if (dm_pool_metadata_needs_check(pool->pmd)) {
DMERR("%s: unable to grow the metadata device until repaired.",
dm_device_name(pool->pool_md)); return 0;
}
warn_if_metadata_device_too_big(pool->md_dev);
DMINFO("%s: growing the metadata device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
sb_metadata_dev_size, metadata_dev_size);
if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
set_pool_mode(pool, PM_WRITE);
r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); if (r) {
metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); return r;
}
*need_commit = true;
}
return 0;
}
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 *
 * Steps, in order: bind this target to the pool, resize the data device
 * if required, resize the metadata device if required, then commit when
 * either resize asked for it.
 *
 * Returns 0 on success, otherwise the error from the first failing step;
 * that error is replaced by 0 when the pool is in PM_FAIL mode.
 */
static int pool_preresume(struct dm_target *ti)
{
	int r;
	/*
	 * Start both flags at false so the commit check below never reads an
	 * indeterminate value, whatever the resize helpers write on their
	 * various return paths.
	 */
	bool need_commit1 = false;
	bool need_commit2 = false;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		goto out;

	r = maybe_resize_data_dev(ti, &need_commit1);
	if (r)
		goto out;

	r = maybe_resize_metadata_dev(ti, &need_commit2);
	if (r)
		goto out;

	/* The result of the commit is deliberately not propagated. */
	if (need_commit1 || need_commit2)
		(void) commit(pool);
out:
	/*
	 * When a thin-pool is PM_FAIL, it cannot be rebuilt if
	 * bio is in deferred list. Therefore need to return 0
	 * to allow pool_resume() to flush IO.
	 */
	if (r && get_pool_mode(pool) == PM_FAIL)
		r = 0;

	return r;
}
/*
 * Walk every thin device returned by get_first_thin()/get_next_thin()
 * for @pool and suspend its mapped device with
 * dm_internal_suspend_noflush().
 */
static void pool_suspend_active_thins(struct pool *pool)
{
	struct thin_c *tc;

	/* Suspend all active thin devices */
	for (tc = get_first_thin(pool); tc; tc = get_next_thin(pool, tc))
		dm_internal_suspend_noflush(tc->thin_md);
}
/*
 * Walk every thin device returned by get_first_thin()/get_next_thin()
 * for @pool and resume its mapped device with dm_internal_resume().
 */
static void pool_resume_active_thins(struct pool *pool)
{
	struct thin_c *tc;

	/* Resume all active thin devices */
	for (tc = get_first_thin(pool); tc; tc = get_next_thin(pool, tc))
		dm_internal_resume(tc->thin_md);
}
/*
 * Verify that a message carries exactly @args_required arguments.
 *
 * Returns 0 when @argc matches, otherwise logs a warning with both counts
 * and returns -EINVAL.
 */
static int check_arg_count(unsigned int argc, unsigned int args_required)
{
	if (argc == args_required)
		return 0;

	DMWARN("Message received with %u arguments instead of %u.",
	       argc, args_required);
	return -EINVAL;
}
/*
 * Parse @arg as a base-10 device id and store it in @dev_id.
 *
 * The id is accepted only if the string parses and the value does not
 * exceed MAX_DEV_ID.  On rejection a warning is logged when @warning is
 * non-zero, and -EINVAL is returned.
 *
 * Note that @dev_id is the parse destination, so it may already have been
 * written when the value is rejected for being out of range.
 */
static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	bool parsed = !kstrtoull(arg, 10, (unsigned long long *)dev_id);

	if (parsed && *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}
staticint process_create_thin_mesg(unsignedint argc, char **argv, struct pool *pool)
{
dm_thin_id dev_id; int r;
r = check_arg_count(argc, 2); if (r) return r;
r = read_dev_id(argv[1], &dev_id, 1); if (r) return r;
r = dm_pool_create_thin(pool->pmd, dev_id); if (r) {
DMWARN("Creation of new thinly-provisioned device with id %s failed.",
argv[1]); return r;
}
return 0;
}
staticint process_create_snap_mesg(unsignedint argc, char **argv, struct pool *pool)
{
dm_thin_id dev_id;
dm_thin_id origin_dev_id; int r;
r = check_arg_count(argc, 3); if (r) return r;
r = read_dev_id(argv[1], &dev_id, 1); if (r) return r;
r = read_dev_id(argv[2], &origin_dev_id, 1); if (r) return r;
r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); if (r) {
DMWARN("Creation of new snapshot %s of device %s failed.",
argv[1], argv[2]); return r;
}
return 0;
}
staticint process_delete_mesg(unsignedint argc, char **argv, struct pool *pool)
{
dm_thin_id dev_id; int r;
r = check_arg_count(argc, 2); if (r) return r;
r = read_dev_id(argv[1], &dev_id, 1); if (r) return r;
r = dm_pool_delete_thin_device(pool->pmd, dev_id); if (r)
DMWARN("Deletion of thin device %s failed.", argv[1]);
return r;
}
staticint process_set_transaction_id_mesg(unsignedint argc, char **argv, struct pool *pool)
{
dm_thin_id old_id, new_id; int r;
r = check_arg_count(argc, 3); if (r) return r;
if (kstrtoull(argv[1], 10, (unsignedlonglong *)&old_id)) {
DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); return -EINVAL;
}
if (kstrtoull(argv[2], 10, (unsignedlonglong *)&new_id)) {
DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); return -EINVAL;
}
r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); if (r) {
DMWARN("Failed to change transaction id from %s to %s.",
argv[1], argv[2]); return r;
}
return 0;
}
staticint process_reserve_metadata_snap_mesg(unsignedint argc, char **argv, struct pool *pool)
{ int r;
r = check_arg_count(argc, 1); if (r) return r;
(void) commit(pool);
r = dm_pool_reserve_metadata_snap(pool->pmd); if (r)
DMWARN("reserve_metadata_snap message failed.");
return r;
}
staticint process_release_metadata_snap_mesg(unsignedint argc, char **argv, struct pool *pool)
{ int r;
r = check_arg_count(argc, 1); if (r) return r;
r = dm_pool_release_metadata_snap(pool->pmd); if (r)
DMWARN("release_metadata_snap message failed.");
if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md)); return -EOPNOTSUPP;
}
if (!strcasecmp(argv[0], "create_thin"))
r = process_create_thin_mesg(argc, argv, pool);
elseif (!strcasecmp(argv[0], "create_snap"))
r = process_create_snap_mesg(argc, argv, pool);
elseif (!strcasecmp(argv[0], "delete"))
r = process_delete_mesg(argc, argv, pool);
elseif (!strcasecmp(argv[0], "set_transaction_id"))
r = process_set_transaction_id_mesg(argc, argv, pool);
elseif (!strcasecmp(argv[0], "reserve_metadata_snap"))
r = process_reserve_metadata_snap_mesg(argc, argv, pool);
elseif (!strcasecmp(argv[0], "release_metadata_snap"))
r = process_release_metadata_snap_mesg(argc, argv, pool);
else
DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
/* * If max_sectors is smaller than pool->sectors_per_block adjust it * to the highest possible power-of-2 factor of pool->sectors_per_block. * This is especially beneficial when the pool's data device is a RAID * device that has a full stripe width that matches pool->sectors_per_block * -- because even though partial RAID stripe-sized IOs will be issued to a * single RAID stripe; when aggregated they will end on a full RAID stripe * boundary.. which avoids additional partial RAID stripe writes cascading
*/ if (limits->max_sectors < pool->sectors_per_block) { while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
limits->max_sectors--;
limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
}
}
/* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them.
*/ if (io_opt_sectors < pool->sectors_per_block ||
!is_factor(io_opt_sectors, pool->sectors_per_block)) { if (is_factor(pool->sectors_per_block, limits->max_sectors))
limits->io_min = limits->max_sectors << SECTOR_SHIFT; else
limits->io_min = pool->sectors_per_block << SECTOR_SHIFT;
limits->io_opt = pool->sectors_per_block << SECTOR_SHIFT;
}
/* * pt->adjusted_pf is a staging area for the actual features to use. * They get transferred to the live pool in bind_control_target() * called from pool_preresume().
*/
if (pt->adjusted_pf.discard_enabled) {
disable_discard_passdown_if_not_supported(pt); if (!pt->adjusted_pf.discard_passdown)
limits->max_hw_discard_sectors = 0; /* * The pool uses the same discard limits as the underlying data * device. DM core has already set this up.
*/
} else { /* * Must explicitly disallow stacking discard limits otherwise the * block layer will stack them if pool's data device has support.
*/
limits->discard_granularity = 0;
}
}
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev); if (tc->origin_dev)
dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
}
/* * Thin target parameters: * * <pool_dev> <dev_id> [origin_dev] * * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) * dev_id: the internal device identifier * origin_dev: a device external to the pool that should act as the origin * * If the pool device has discards disabled, they get disabled for the thin * device as well.
*/ staticint thin_ctr(struct dm_target *ti, unsignedint argc, char **argv)
{ int r; struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL; goto out_unlock;
}
tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); if (!tc) {
ti->error = "Out of memory";
r = -ENOMEM; goto out_unlock;
}
tc->thin_md = dm_table_get_md(ti->table);
spin_lock_init(&tc->lock);
INIT_LIST_HEAD(&tc->deferred_cells);
bio_list_init(&tc->deferred_bio_list);
bio_list_init(&tc->retry_on_resume_list);
tc->sort_bio_list = RB_ROOT;
if (argc == 3) { if (!strcmp(argv[0], argv[2])) {
ti->error = "Error setting origin device";
r = -EINVAL; goto bad_origin_dev;
}
/* In case the pool supports discards, pass them on. */ if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
ti->max_discard_granularity = true;
}
mutex_unlock(&dm_thin_pool_table.mutex);
spin_lock_irq(&tc->pool->lock); if (tc->pool->suspended) {
spin_unlock_irq(&tc->pool->lock);
mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
ti->error = "Unable to activate thin device while pool is suspended";
r = -EINVAL; goto bad;
}
refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irq(&tc->pool->lock); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly * added tc isn't yet visible). So this reduces latency since we * aren't then dependent on the periodic commit to wake_worker().
*/
synchronize_rcu();
if (get_pool_mode(tc->pool) == PM_FAIL) {
DMEMIT("Fail"); return;
}
if (!tc->td)
DMEMIT("-"); else { switch (type) { case STATUSTYPE_INFO:
r = dm_thin_get_mapped_count(tc->td, &mapped); if (r) {
DMERR("dm_thin_get_mapped_count returned %d", r); goto err;
}
r = dm_thin_get_highest_mapped_block(tc->td, &highest); if (r < 0) {
DMERR("dm_thin_get_highest_mapped_block returned %d", r); goto err;
}
/* * We can't call dm_pool_get_data_dev_size() since that blocks. So * we follow a more convoluted path through to the pool's target.
*/ if (!pool->ti) return 0; /* nothing is bound */
module_param_named(no_space_timeout, no_space_timeout_secs, uint, 0644);
MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.