// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
*/
/*
 * Size of each GC scratch pad.  This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};
/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};
#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};
/* NOTE(review): these are members of a per-mount GC data structure; its
 * opening "struct ... {" line (and possibly leading members) is not visible
 * in this chunk — confirm against the full file. */

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used for the GC reads, and the index indicating which
	 * one is currently in use.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};
/* * We aim to keep enough zones free in stock to fully use the open zone limit * for data placement purposes. Additionally, the m_zonegc_low_space tunable * can be set to make sure a fraction of the unused blocks are available for * writing.
*/ bool
xfs_zoned_need_gc( struct xfs_mount *mp)
{
s64 available, free, threshold;
s32 remainder;
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) returnfalse;
available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
if (available <
mp->m_groups[XG_TYPE_RTG].blocks *
(mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) returntrue;
/* NOTE(review): allocation body of the GC data setup.  The function header,
 * local declarations, and the out_free_data/out_free_recs/out_free_scratch
 * error labels referenced below are not visible in this chunk. */
	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec.  It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;
/* * Query the rmap of the victim zone to gather the records to evacuate.
*/ staticint
xfs_zone_gc_query_cb( struct xfs_btree_cur *cur, conststruct xfs_rmap_irec *irec, void *private)
{ struct xfs_zone_gc_iter *iter = private;
/* * Sort the rmap records by inode number and increasing offset to * defragment the mappings. * * This could be further enhanced by an even bigger look ahead window, * but that's better left until we have better detection of changes to * inode mapping to avoid the potential of GCing already dead data.
*/
sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
xfs_zone_gc_rmap_rec_cmp, NULL);
if (error == 0) { /* * We finished iterating through the zone.
*/
iter->next_startblock = rtg_blocks(rtg); if (iter->rec_count == 0) goto done;
}
/* skip zones that are just waiting for a reset */ if (rtg_rmap(rtg)->i_used_blocks == 0 ||
rtg_rmap(rtg)->i_used_blocks >= victim_used) {
xfs_rtgroup_rele(rtg); continue;
}
if (victim_rtg)
xfs_rtgroup_rele(victim_rtg);
victim_rtg = rtg;
victim_used = rtg_rmap(rtg)->i_used_blocks;
/* * Any zone that is less than 1 percent used is fair game for * instant reclaim. All of these zones are in the last * bucket, so avoid the expensive division for the zones * in the other buckets.
*/ if (bucket == 0 &&
rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) break;
}
return victim_rtg;
}
/* * Iterate through all zones marked as reclaimable and find a candidate to * reclaim.
*/ staticbool
xfs_zone_gc_select_victim( struct xfs_zone_gc_data *data)
{ struct xfs_zone_gc_iter *iter = &data->iter; struct xfs_mount *mp = data->mp; struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_rtgroup *victim_rtg = NULL; unsignedint bucket;
if (xfs_is_shutdown(mp)) returnfalse;
if (iter->victim_rtg) returntrue;
/* * Don't start new work if we are asked to stop or park.
*/ if (kthread_should_stop() || kthread_should_park()) returnfalse;
if (!xfs_zoned_need_gc(mp)) returnfalse;
spin_lock(&zi->zi_used_buckets_lock); for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); if (victim_rtg) break;
}
spin_unlock(&zi->zi_used_buckets_lock);
/* * Ensure we have a valid open zone to write the GC data to. * * If the current target zone has space keep writing to it, else first wait for * all pending writes and then pick a new one.
*/ staticstruct xfs_open_zone *
xfs_zone_gc_ensure_target( struct xfs_mount *mp)
{ struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
/* * Directly allocate GC blocks from the reserved pool. * * If we'd take them from the normal pool we could be stealing blocks * from a regular writer, which would then have to wait for GC and * deadlock.
*/
spin_lock(&mp->m_sb_lock);
*count_fsb = min(*count_fsb,
rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
*count_fsb = min3(*count_fsb,
mp->m_free[XC_FREE_RTEXTENTS].res_avail,
mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
spin_unlock(&mp->m_sb_lock);
/* add right before the original chunk */
WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&split_chunk->entry, &chunk->entry); return split_chunk;
}
/* * Cycle through the iolock and wait for direct I/O and layouts to * ensure no one is reading from the old mapping before it goes away. * * Note that xfs_zoned_end_io() below checks that no other writer raced * with us to update the mapping by checking that the old startblock * didn't change.
*/
xfs_ilock(ip, iolock);
error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); if (!error)
inode_dio_wait(VFS_I(ip));
xfs_iunlock(ip, iolock); if (error) goto free;
if (chunk->is_seq)
chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
chunk->new_daddr, chunk->oz, chunk->old_startblock);
free: if (error)
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
xfs_zone_gc_free_chunk(chunk);
}
/* * Also use the bio to drive the state machine when neither * zone reset nor discard is supported to keep things simple.
*/ if (xfs_zone_gc_prepare_reset(bio, rtg))
submit_bio(bio); else
bio_endio(bio);
} while (next);
}
/* * Handle the work to read and write data for GC and to reset the zones, * including handling all completions. * * Note that the order of the chunks is preserved so that we don't undo the * optimal order established by xfs_zone_gc_query().
*/ staticbool
xfs_zone_gc_handle_work( struct xfs_zone_gc_data *data)
{ struct xfs_zone_info *zi = data->mp->m_zone_info; struct xfs_gc_bio *chunk, *next; struct xfs_group *reset_list; struct blk_plug plug;
blk_start_plug(&plug); while (xfs_zone_gc_start_chunk(data))
;
blk_finish_plug(&plug); returntrue;
}
/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;
/* NOTE(review): the remainder of the GC thread main loop is missing from
 * this chunk. */
/*
 * Let the GC thread run again on a zoned mount; pairs with the
 * kthread_park() done by xfs_zone_gc_stop().
 */
void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (!xfs_has_zoned(mp))
		return;
	kthread_unpark(mp->m_zone_info->zi_gc_thread);
}
/*
 * Park the GC thread on a zoned mount so that it stops doing work;
 * undone by xfs_zone_gc_start().
 */
void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (!xfs_has_zoned(mp))
		return;
	kthread_park(mp->m_zone_info->zi_gc_thread);
}
int
xfs_zone_gc_mount( struct xfs_mount *mp)
{ struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_zone_gc_data *data; struct xfs_open_zone *oz; int error;
/* * If there are no free zones available for GC, pick the open zone with * the least used space to GC into. This should only happen after an * unclean shutdown near ENOSPC while GC was ongoing. * * We also need to do this for the first gc zone allocation if we * unmounted while at the open limit.
*/ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
zi->zi_nr_open_zones == mp->m_max_open_zones)
oz = xfs_zone_gc_steal_open(zi); else
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); if (!oz) {
xfs_warn(mp, "unable to allocate a zone for gc");
error = -EIO; goto out;
}
/*
 * NOTE(review): trailing boilerplate from the extraction source, not part of
 * the original file; wrapped in a comment so it cannot break compilation.
 * Translated from German: "The information on this web page was compiled
 * carefully to the best of our knowledge.  However, neither completeness,
 * correctness, nor quality of the provided information is guaranteed.
 * Remark: the colored syntax rendering and the measurement are still
 * experimental."
 */