/*
 *--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels which effectively maps (thin
 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
 *   field holding the time in the low 24 bits, and block in the top 40
 *   bits.
 *
 * BTrees consist solely of btree_nodes, that fill a block.  Some are
 * internal nodes, as such their values are a __le64 pointing to other
 * nodes.  Leaf nodes can store data of any reasonable size (ie. much
 * smaller than the block size).  The nodes consist of the header,
 * followed by an array of keys, followed by an array of values.  We have
 * to binary search on the keys so they're all held together to help the
 * cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block, and has some details about how many free entries there
 *   are etc.
 *
 * - The bitmap blocks have a header (for the checksum), then the rest
 *   of the block is pairs of bits, with the meaning being:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has one single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata can support data devices that are hundreds of terabytes.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try and avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------
 */
/*
 * Upper bound on the number of metadata block locks a single btree
 * operation may hold concurrently:
 *
 * For btree insert:
 *   3 for btree insert +
 *   2 for btree lookup used within space map
 * For btree remove:
 *   2 for shadow spine +
 *   4 for rebalance 3 child node
 */
#define THIN_MAX_CONCURRENT_LOCKS 6
/* This should be plenty */ #define SPACE_MAP_ROOT_SIZE 128
/*
 * Little endian on-disk superblock and device details.
 *
 * NOTE(review): the declaration below mixes on-disk fields (csum, flags,
 * blocknr, space map roots) with what looks like in-core pool state
 * (pre-commit callback, fail_io/in_service bitfields) — this chunk
 * appears to have lost intervening declarations during extraction;
 * verify against the complete file before relying on this layout.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback
	 * before the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll
	 * back to the previous (good) transaction failed.  The only pool
	 * metadata operation possible in this state is the closing of the
	 * device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the
	 * interfaces that imply the pool is in-service (e.g. thin devices
	 * created/deleted, thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read it into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
/* * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as * possible. 'with_runs' reads contiguous runs of blocks, and calls the * given sm function.
*/ typedefint (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
/*
 * Take pmd->root_lock for writing without marking the pool in-service.
 *
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 *
 * Fix: the original text fused "static inline void" into the invalid
 * token "staticinlinevoid".
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}
/*
 * NOTE(review): fragment — the enclosing function's signature is not
 * visible in this chunk ('features', 'r', 'sblock', 'pmd', 'disk_super'
 * are declared elsewhere).  "(unsignedlong)" below is a fused token that
 * should read "(unsigned long)"; fix when the full file is available.
 */
/* Reject metadata carrying incompat feature bits we don't understand. */
features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; if (features) {
DMERR("could not access metadata due to unsupported optional features (%lx).",
(unsignedlong)features); return -EINVAL;
}
/*
 * Check for read-only metadata to skip the following RDWR checks.
 */
if (bdev_read_only(pmd->bdev)) return 0;
/* compat_ro feature bits only matter when opening read-write. */
features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; if (features) {
DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
(unsignedlong)features); return -EINVAL;
}
/* Take a read lock on the superblock so it can be inspected. */
r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &sblock); if (r < 0) {
DMERR("couldn't read superblock"); return r;
}
/*
 * NOTE(review): fragment — the enclosing function's signature and the
 * bad_unlock_sblock/bad_cleanup_* labels jumped to below are outside
 * this chunk.  "(unsignedlonglong)" is a fused token that should read
 * "(unsigned long long)".
 */
disk_super = dm_block_data(sblock);
/* Verify the data block size hasn't changed */
if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
DMERR("changing the data block size (from %u to %llu) is not supported",
le32_to_cpu(disk_super->data_block_size),
(unsignedlonglong)pmd->data_block_size);
r = -EINVAL; goto bad_unlock_sblock;
}
/* Bail out if the on-disk feature flags are unsupported. */
r = __check_incompat_features(disk_super, pmd); if (r < 0) goto bad_unlock_sblock;
/* Open the data-device space map from its on-disk root. */
pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root, sizeof(disk_super->data_space_map_root)); if (IS_ERR(pmd->data_sm)) {
DMERR("sm_disk_open failed");
r = PTR_ERR(pmd->data_sm);
pmd->data_sm = NULL; goto bad_cleanup_tm;
}
/* Non-blocking clone of the transaction manager. */
pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); if (!pmd->nb_tm) {
DMERR("could not create non-blocking clone tm");
r = -ENOMEM; goto bad_cleanup_data_sm;
}
/*
 * For pool metadata opening process, root setting is redundant
 * because it will be set again in __begin_transaction().  But dm
 * pool aborting process really needs to get last transaction's
 * root to avoid accessing broken btree.
 */
pmd->root = le64_to_cpu(disk_super->data_mapping_root);
pmd->details_root = le64_to_cpu(disk_super->device_details_root);
/*
 * We re-read the superblock every time.  Shouldn't need to do this
 * really.
 */
r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
&sb_validator, &sblock); if (r) return r;
/*
 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
 */
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
/*
 * NOTE(review): fragment — interior of a commit path (the caller must
 * hold pmd->root_lock, as the BUG_ON asserts), followed by the tail of
 * a metadata-reserve calculation; the enclosing function headers are
 * outside this chunk.
 */
BUG_ON(!rwsem_is_locked(&pmd->root_lock));
/* Nothing to persist until the pool has actually been put in service. */
if (unlikely(!pmd->in_service)) return 0;
/* Give the target a chance to prepare/veto before committing. */
if (pmd->pre_commit_fn) {
r = pmd->pre_commit_fn(pmd->pre_commit_context); if (r < 0) {
DMERR("pre-commit callback failed"); return r;
}
}
r = __write_changed_details(pmd); if (r < 0) return r;
r = dm_sm_commit(pmd->data_sm); if (r < 0) return r;
r = dm_tm_pre_commit(pmd->tm); if (r < 0) return r;
r = save_sm_roots(pmd); if (r < 0) return r;
r = superblock_lock(pmd, &sblock); if (r) return r;
/*
 * Reserve the smaller of max_blocks and a tenth of the metadata
 * device; fall back to max_blocks if the size can't be read.
 */
r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total); if (r) {
DMERR("could not get size of metadata device");
pmd->metadata_reserve = max_blocks;
} else
pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}
/*
 * NOTE(review): fragment — tail of the pool-metadata close path; the
 * enclosing function header and the computation of 'open_devices' are
 * outside this chunk.
 */
/* Refuse to tear down while thin devices are still open. */
if (open_devices) {
DMERR("attempt to close pmd when %u device(s) are still open",
open_devices); return -EBUSY;
}
/* Best-effort final commit before destroying the data objects. */
pmd_write_lock_in_core(pmd); if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
r = __commit_transaction(pmd); if (r < 0)
DMWARN("%s: __commit_transaction() failed, error = %d",
__func__, r);
}
pmd_write_unlock(pmd);
__destroy_persistent_data_objects(pmd, true);
kfree(pmd); return 0;
}
/*
 * __open_device: Returns @td corresponding to device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 *
 * NOTE(review): "staticint" below is a fused token ("static int").
 * Also, from the dm_btree_empty() call onwards the text appears to
 * belong to a different (device-creation) function whose header is
 * missing from this chunk — verify against the complete file before
 * editing; rewriting this span as-is would be unsafe.
 */
staticint __open_device(struct dm_pool_metadata *pmd,
dm_thin_id dev, int create, struct dm_thin_device **td)
{ int r, changed = 0; struct dm_thin_device *td2;
uint64_t key = dev; struct disk_device_details details_le;
/*
 * If the device is already open, return it.
 */
list_for_each_entry(td2, &pmd->thin_devices, list) if (td2->id == dev) {
/*
 * May not create an already-open device.
 */
if (create) return -EEXIST;
td2->open_count++;
*td = td2; return 0;
}
/*
 * Check the device exists.
 */
r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
&key, &details_le); if (r) { if (r != -ENODATA || !create) return r;
/* Ensure no device-details entry exists for this id yet. */
r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
&key, NULL); if (!r) return -EEXIST;
/*
 * Create an empty btree for the mappings.
 */
r = dm_btree_empty(&pmd->bl_info, &dev_root); if (r) return r;
/*
 * Insert it into the main mapping tree.
 */
value = cpu_to_le64(dev_root);
__dm_bless_for_disk(&value);
r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); if (r) {
dm_btree_del(&pmd->bl_info, dev_root); return r;
}
/* Open the new device; roll back both trees on failure. */
r = __open_device(pmd, dev, 1, &td); if (r) {
dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
dm_btree_del(&pmd->bl_info, dev_root); return r;
}
__close_device(td);
return r;
}
/*
 * Create a new thin device @dev under the metadata write lock; no-op
 * (returning -EINVAL) once the metadata has failed (pmd->fail_io).
 *
 * NOTE(review): truncated — the closing "return r; }" of this function
 * is missing from this chunk.
 */
int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{ int r = -EINVAL;
pmd_write_lock(pmd); if (!pmd->fail_io)
r = __create_thin(pmd, dev);
pmd_write_unlock(pmd);
/*
 * NOTE(review): fragment — interior of a snapshot-creation path; the
 * enclosing function header, local declarations ('dev_key', 'key',
 * 'value', 'origin_root', 'td', 'origin') and the 'bad:' label are
 * outside this chunk.
 */
/* check this device is unused */
r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
&dev_key, NULL); if (!r) return -EEXIST;
/* find the mapping tree for the origin */
r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); if (r) return r;
origin_root = le64_to_cpu(value);
/* clone the origin, an inc will do */
dm_tm_inc(pmd->tm, origin_root);
/* insert into the main mapping tree */
value = cpu_to_le64(origin_root);
__dm_bless_for_disk(&value);
key = dev;
r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); if (r) {
dm_tm_dec(pmd->tm, origin_root); return r;
}
pmd->time++;
/* Record snapshot details on the new device, then release it. */
r = __open_device(pmd, dev, 1, &td); if (r) goto bad;
r = __set_snapshot_details(pmd, td, origin, pmd->time);
__close_device(td);
/*
 * We commit to ensure the btree roots which we increment in a
 * moment are up to date.
 */
r = __commit_transaction(pmd); if (r < 0) {
DMWARN("%s: __commit_transaction() failed, error = %d",
__func__, r); return r;
}
/*
 * NOTE(review): fragment — interior of the metadata-snapshot (held
 * root) path; 'copy', 'held_root' and the enclosing function are
 * declared outside this chunk.
 */
/*
 * Wipe the spacemap since we're not publishing this.
 */
memset(&disk_super->data_space_map_root, 0, sizeof(disk_super->data_space_map_root));
memset(&disk_super->metadata_space_map_root, 0, sizeof(disk_super->metadata_space_map_root));
/*
 * Increment the data structures that need to be preserved.
 */
dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
dm_tm_unlock(pmd->tm, copy);
/*
 * Write the held root into the superblock.
 */
r = superblock_lock(pmd, &sblock); if (r) {
dm_tm_dec(pmd->tm, held_root); return r;
}
/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 *
 * Fix: the original text fused "static bool" into the invalid token
 * "staticbool".
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}
/*
 * NOTE(review): fragment — interior of a mapping-range removal path;
 * 'keys', 'begin', 'end', 'count', 'mapping_root' and the enclosing
 * function are declared outside this chunk (the while-loop's closing
 * brace is also not visible here).
 */
/*
 * Find the mapping tree
 */
r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value); if (r) return r;
/*
 * Remove from the mapping tree, taking care to inc the
 * ref count so it doesn't get deleted.
 */
mapping_root = le64_to_cpu(value);
dm_tm_inc(pmd->tm, mapping_root);
r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root); if (r) return r;
/*
 * Remove leaves stops at the first unmapped entry, so we have to
 * loop round finding mapped ranges.
 */
while (begin < end) {
r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value); if (r == -ENODATA) break;
if (r) return r;
if (begin >= end) break;
r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count); if (r) return r;
/*
 * NOTE(review): fragment — tail of a space-map resize helper; the
 * function header (taking 'sm' and 'new_count') is missing from this
 * chunk.  Shrinking is explicitly rejected; growing extends the map.
 */
r = dm_sm_get_nr_blocks(sm, &old_count); if (r) return r;
/* Already the requested size: nothing to do. */
if (new_count == old_count) return 0;
if (new_count < old_count) {
DMERR("cannot reduce size of space map"); return -EINVAL;
}
return dm_sm_extend(sm, new_count - old_count);
}
/*
 * Resize the pool's data space map to @new_count blocks.
 *
 * Runs under the metadata write lock.  Once the metadata has failed
 * (pmd->fail_io) this becomes a no-op that returns -EINVAL.
 */
int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int ret;

	pmd_write_lock(pmd);
	ret = pmd->fail_io ? -EINVAL
			   : __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return ret;
}
/*
 * Resize the metadata space map to @new_count blocks and, on success,
 * recompute the commit reserve.
 *
 * NOTE(review): truncated — the closing "return r; }" of this function
 * is missing from this chunk.
 */
int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{ int r = -EINVAL;
pmd_write_lock(pmd); if (!pmd->fail_io) {
r = __resize_space_map(pmd->metadata_sm, new_count); if (!r)
__set_metadata_reserve(pmd);
}
pmd_write_unlock(pmd);
/*
 * Set THIN_METADATA_NEEDS_CHECK_FLAG in the in-core flags and lock the
 * superblock — presumably to persist the flag; the remainder of the
 * function (superblock update, unlock, 'out:' label and return) is
 * truncated in this chunk, so confirm against the complete file.
 */
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{ int r = -EINVAL; struct dm_block *sblock; struct thin_disk_superblock *disk_super;
pmd_write_lock(pmd); if (pmd->fail_io) goto out;
pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
r = superblock_lock(pmd, &sblock); if (r) {
DMERR("couldn't lock superblock"); goto out;
}
/*
 * NOTE(review): the following German website disclaimer is extraneous
 * text accidentally appended to this source chunk; it is not part of
 * the driver and should be removed during cleanup.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */