// SPDX-License-Identifier: GPL-2.0
/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
*/
/* Only copy the header, key, and one pointer. */
memcpy(dest, src, 2 * sizeof(uint64_t));
dest->ptr[0] = src->ptr[i];
SET_KEY_PTRS(dest, 1); /* We didn't copy the checksum so clear that bit. */
SET_KEY_CSUM(dest, 0);
}
bool __bch_cut_front(conststruct bkey *where, struct bkey *k)
{ unsignedint i, len = 0;
if (bkey_cmp(where, &START_KEY(k)) <= 0) returnfalse;
if (bkey_cmp(where, k) < 0)
len = KEY_OFFSET(k) - KEY_OFFSET(where); else
bkey_copy_key(k, where);
for (i = 0; i < KEY_PTRS(k); i++)
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len);
/*
 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
 * it used to be 64, but I realized the lookup code would touch slightly less
 * memory if it was 128.
 *
 * It defines the number of bytes (in struct bset) per struct bkey_float in
 * the auxiliary search tree - when we're done searching the bset_float tree we
 * have this many bytes left that we do a linear search over.
 *
 * Since (after level 5) every level of the bset_tree is on a new cacheline,
 * we're touching one fewer cacheline in the bset tree in exchange for one more
 * cacheline in the linear search - but the linear search might stop before it
 * gets to the second cacheline.
*/
#define BSET_CACHELINE 128
/* Space required for the btree node keys */ staticinline size_t btree_keys_bytes(struct btree_keys *b)
{ return PAGE_SIZE << b->page_order;
}
/*
 * struct btree_keys is embedded in struct btree, and struct
 * bset_tree is embedded into struct btree_keys. They are all
 * initialized as 0 by kzalloc() in mca_bucket_alloc(), and
 * b->set[0].data is allocated in bch_btree_keys_alloc(), so we
 * don't have to initialize b->set[].size and b->set[].data here
 * any more.
*/
}
/* Binary tree stuff for auxiliary search trees */
/*
 * Return the array index next to j when doing an in-order traversal
 * of a binary tree which is stored in a linear array
*/ staticunsignedint inorder_next(unsignedint j, unsignedint size)
{ if (j * 2 + 1 < size) {
j = j * 2 + 1;
/*
 * Return the array index previous to j when doing an in-order traversal
 * of a binary tree which is stored in a linear array
*/ staticunsignedint inorder_prev(unsignedint j, unsignedint size)
{ if (j * 2 < size) {
j = j * 2;
/* * I have no idea why this code works... and I'm the one who wrote it * * However, I do know what it does: * Given a binary tree constructed in an array (i.e. how you normally implement * a heap), it converts a node in the tree - referenced by array index - to the * index it would have if you did an inorder traversal. * * Also tested for every j, size up to size somewhere around 6 million. * * The binary tree starts at array index 1, not 0 * extra is a function of size: * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
*/ staticunsignedint __to_inorder(unsignedint j, unsignedint size, unsignedint extra)
{ unsignedint b = fls(j); unsignedint shift = fls(size - 1) - b;
j ^= 1U << (b - 1);
j <<= 1;
j |= 1;
j <<= shift;
if (j > extra)
j -= (j - extra) >> 1;
return j;
}
/*
 * Return the cacheline index in bset_tree->data, where j is an index
 * into the linear array which stores the auxiliary binary tree
*/ staticunsignedint to_inorder(unsignedint j, struct bset_tree *t)
{ return __to_inorder(j, t->size, t->extra);
}
/*
 * Return an index into the linear array which stores the auxiliary binary
 * tree; j is the cacheline index of t->data.
*/ staticunsignedint inorder_to_tree(unsignedint j, struct bset_tree *t)
{ return __inorder_to_tree(j, t->size, t->extra);
}
/* * Cacheline/offset <-> bkey pointer arithmetic: * * t->tree is a binary search tree in an array; each node corresponds to a key * in one cacheline in t->set (BSET_CACHELINE bytes). * * This means we don't have to store the full index of the key that a node in * the binary tree points to; to_inorder() gives us the cacheline, and then * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. * * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to * make this work. * * To construct the bfloat for an arbitrary key we need to know what the key * immediately preceding it is: we have to check if the two keys differ in the * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size * of the previous key so we can walk backwards to it from t->tree[j]'s key.
*/
/* * For the write set - the one we're currently inserting keys into - we don't * maintain a full search tree, we just keep a simple lookup table in t->prev.
*/ staticstruct bkey *table_to_bkey(struct bset_tree *t, unsignedint cacheline)
{ return cacheline_to_bkey(t, cacheline, t->prev[cacheline]);
}
/*
 * Calculate mantissa value for struct bkey_float.
 * If the most significant bit of f->exponent is not set, then
 *  - f->exponent >> 6 is 0
 *  - p[0] points to bkey->low
 *  - p[-1] borrows bits from KEY_INODE() of bkey->high
 * If the most significant bit of f->exponent is set, then
 *  - f->exponent >> 6 is 1
 *  - p[0] points to bits from KEY_INODE() of bkey->high
 *  - p[-1] points to other bits from KEY_INODE() of
 *    bkey->high too.
 * See make_bfloat() to check when the most significant bit of f->exponent
 * is set or not.
*/ staticinlineunsignedint bfloat_mantissa(conststruct bkey *k, struct bkey_float *f)
{ const uint64_t *p = &k->low - (f->exponent >> 6);
BUG_ON(m < l || m > r);
BUG_ON(bkey_next(p) != m);
/*
 * If l and r have different KEY_INODE values (different backing
 * device), f->exponent records how many least significant bits
 * are different in KEY_INODE values and sets most significant
 * bits to 1 (by +64).
 * If l and r have same KEY_INODE value, f->exponent records
 * how many different bits in least significant bits of bkey->low.
 * See bfloat_mantissa() how the most significant bit of
 * f->exponent is used to calculate bfloat mantissa value.
*/ if (KEY_INODE(l) != KEY_INODE(r))
f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; else
f->exponent = fls64(r->low ^ l->low);
/*
 * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to
 * accelerate bkey search in a btree node (pointed by bset_tree->data in
 * memory). After searching in the auxiliary tree by calling bset_search_tree(),
 * a struct bset_search_iter is returned which indicates the range [l, r] from
 * bset_tree->data where the searched bkey might be inside. Then a following
 * linear comparison does the exact search, see __bch_bset_search() for how
 * the auxiliary tree is used.
*/ void bch_bset_build_written_tree(struct btree_keys *b)
{ struct bset_tree *t = bset_tree_last(b); struct bkey *prev = NULL, *k = t->data->start; unsignedint j, cacheline = 1;
/* First we figure out where the first key in each cacheline is */ for (j = inorder_next(0, t->size);
j;
j = inorder_next(j, t->size)) { while (bkey_to_cacheline(t, k) < cacheline) {
prev = k;
k = bkey_next(k);
}
/* We're getting called from btree_split() or btree_gc, just bail out */ if (!t->size) return;
/* * k is the key we just inserted; we need to find the entry in the * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one
*/ while (j < t->size &&
table_to_bkey(t, j) <= k)
j++;
/* * Adjust all the lookup table entries, and find a new key for any that * have gotten too big
*/ for (; j < t->size; j++) {
t->prev[j] += shift;
if (t->prev[j] > 7) {
k = table_to_bkey(t, j - 1);
while (k < cacheline_to_bkey(t, j, 0))
k = bkey_next(k);
if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) return;
/* Possibly add a new entry to the end of the lookup table */
for (k = table_to_bkey(t, t->size - 1);
k != bset_bkey_last(t->data);
k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) {
t->prev[t->size] =
bkey_to_cacheline_offset(t, t->size, k);
t->size++;
}
}
/* * Tries to merge l and r: l should be lower than r * Returns true if we were able to merge. If we did merge, l will be the merged * key, r will be untouched.
*/ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{ if (!b->ops->key_merge) returnfalse;
/* * Generic header checks * Assumes left and right are in order * Left and right must be exactly aligned
*/ if (!bch_bkey_equal_header(l, r) ||
bkey_cmp(l, &START_KEY(r))) returnfalse;
/* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set * to NULL inside preceding_key().
*/ if (b->ops->is_extents)
preceding_key(&START_KEY(k), &preceding_key_p); else
preceding_key(k, &preceding_key_p);
m = bch_btree_iter_stack_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status;
status = BTREE_INSERT_STATUS_INSERT;
while (m != bset_bkey_last(i) &&
bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) {
prev = m;
m = bkey_next(m);
}
/* prev is in the tree, if we merge we're done */
status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev &&
bch_bkey_try_merge(b, prev, k)) goto merged; #if 0
status = BTREE_INSERT_STATUS_OVERWROTE; if (m != bset_bkey_last(i) &&
KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; #endif
status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != bset_bkey_last(i) &&
bch_bkey_try_merge(b, k, m)) goto copy;
bch_bset_insert(b, m, k);
copy: bkey_copy(m, k);
merged: return status;
}
/* Lookup */
/* Key range [l, r] within a bset that a tree lookup has narrowed down to. */
struct bset_search_iter {
	struct bkey *l, *r;
};
staticstruct bset_search_iter bset_search_write_set(struct bset_tree *t, conststruct bkey *search)
{ unsignedint li = 0, ri = t->size;
while (li + 1 != ri) { unsignedint m = (li + ri) >> 1;
if (bkey_cmp(table_to_bkey(t, m), search) > 0)
ri = m; else
li = m;
}
if (likely(f->exponent != 127)) { if (f->mantissa >= bfloat_mantissa(search, f))
n = j * 2; else
n = j * 2 + 1;
} else { if (bkey_cmp(tree_to_bkey(t, j), search) > 0)
n = j * 2; else
n = j * 2 + 1;
}
} while (n < t->size);
inorder = to_inorder(j, t);
/* * n would have been the node we recursed to - the low bit tells us if * we recursed left or recursed right.
*/ if (n & 1) {
l = cacheline_to_bkey(t, inorder, f->m);
if (++inorder != t->size) {
f = &t->tree[inorder_next(j, t->size)];
r = cacheline_to_bkey(t, inorder, f->m);
} else
r = bset_bkey_last(t->data);
} else {
r = cacheline_to_bkey(t, inorder, f->m);
if (--inorder) {
f = &t->tree[inorder_prev(j, t->size)];
l = cacheline_to_bkey(t, inorder, f->m);
} else
l = t->data->start;
}
/* * First, we search for a cacheline, then lastly we do a linear search * within that cacheline. * * To search for the cacheline, there's three different possibilities: * * The set is too small to have a search tree, so we just do a linear * search over the whole set. * * The set is the one we're currently inserting into; keeping a full * auxiliary search tree up to date would be too expensive, so we * use a much simpler lookup table to do a binary search - * bset_search_write_set(). * * Or we use the auxiliary search tree we constructed earlier - * bset_search_tree()
*/
if (unlikely(!t->size)) {
i.l = t->data->start;
i.r = bset_bkey_last(t->data);
} elseif (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range * of bits, and keys above and below the set it covers might * differ outside those bits - so we have to special case the * start and end - handle that here:
*/
if (unlikely(bkey_cmp(search, &t->end) >= 0)) return bset_bkey_last(t->data);
if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start;
if (!start && order == b->page_order) { /* * Our temporary buffer is the same size as the btree node's * buffer, we can just swap buffers instead of doing a big * memcpy() * * Don't worry event 'out' is allocated from mempool, it can * still be swapped here. Because state->pool is a page mempool * created by mempool_init_page_pool(), which allocates * pages by alloc_pages() indeed.
*/
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.