Quelle xfs_zone_gc.c

Sprache: C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
* Implement Garbage Collection (GC) of partially used zoned.
*
* To support the purely sequential writes in each zone, zoned XFS needs to be
* able to move data remaining in a zone out of it to reset the zone to prepare
* for writing to it again.
*
* This is done by the GC thread implemented in this file.  To support that a
* number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
* write the garbage collected data into.
*
* Whenever the available space is below the chosen threshold, the GC thread
* looks for potential non-empty but not fully used zones that are worth
* reclaiming.  Once found the rmap for the victim zone is queried, and after
* a bit of sorting to reduce fragmentation, the still live extents are read
* into memory and written to the GC target zone, and the bmap btree of the
* files is updated to point to the new location.  To avoid taking the IOLOCK
* and MMAPLOCK for the entire GC process and thus affecting the latency of
* user reads and writes to the files, the GC writes are speculative and the
* I/O completion checks that no other writes happened for the affected regions
* before remapping.
*
* Once a zone does not contain any valid data, be that through GC or user
* block removal, it is queued for for a zone reset.  The reset operation
* carefully ensures that the RT device cache is flushed and all transactions
* referencing the rmap have been committed to disk.
*/

/*
* Size of each GC scratch pad.  This is also the upper bound for each
* GC I/O, which helps to keep latency down.
*/
#define XFS_GC_CHUNK_SIZE SZ_1M

/*
* Scratchpad data to read GCed data into.
*
* The offset member tracks where the next allocation starts, and freed tracks
* the amount of space that is not used anymore.
*/
#define XFS_ZONE_GC_NR_SCRATCH 2
struct xfs_zone_scratch {
struct folio   *folio;
unsigned int   offset;
unsigned int   freed;
};

/*
* Chunk that is read and written for each GC operation.
*
* Note that for writes to actual zoned devices, the chunk can be split when
* reaching the hardware limit.
*/
struct xfs_gc_bio {
struct xfs_zone_gc_data  *data;

/*
* Entry into the reading/writing/resetting list.  Only accessed from
* the GC thread, so no locking needed.
*/
struct list_head  entry;

/*
* State of this gc_bio.  Done means the current I/O completed.
* Set from the bio end I/O handler, read from the GC thread.
*/
enum {
  XFS_GC_BIO_NEW,
  XFS_GC_BIO_DONE,
} state;

/*
* Pointer to the inode and byte range in the inode that this
* GC chunk is operating on.
*/
struct xfs_inode  *ip;
loff_t    offset;
unsigned int   len;

/*
* Existing startblock (in the zone to be freed) and newly assigned
* daddr in the zone GCed into.
*/
xfs_fsblock_t   old_startblock;
xfs_daddr_t   new_daddr;
struct xfs_zone_scratch  *scratch;

/* Are we writing to a sequential write required zone? */
bool    is_seq;

/* Open Zone being written to */
struct xfs_open_zone  *oz;

/* Bio used for reads and writes, including the bvec used by it */
struct bio_vec   bv;
struct bio   bio; /* must be last */
};

#define XFS_ZONE_GC_RECS  1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
struct xfs_rtgroup  *victim_rtg;
unsigned int   rec_count;
unsigned int   rec_idx;
xfs_agblock_t   next_startblock;
struct xfs_rmap_irec  *recs;
};

/*
* Per-mount GC state.
*/
struct xfs_zone_gc_data {
struct xfs_mount  *mp;

/* bioset used to allocate the gc_bios */
struct bio_set   bio_set;

/*
* Scratchpad used, and index to indicated which one is used.
*/
struct xfs_zone_scratch  scratch[XFS_ZONE_GC_NR_SCRATCH];
unsigned int   scratch_idx;

/*
* List of bios currently being read, written and reset.
* These lists are only accessed by the GC thread itself, and must only
* be processed in order.
*/
struct list_head  reading;
struct list_head  writing;
struct list_head  resetting;

/*
* Iterator for the victim zone.
*/
struct xfs_zone_gc_iter  iter;
};

/*
* We aim to keep enough zones free in stock to fully use the open zone limit
* for data placement purposes. Additionally, the m_zonegc_low_space tunable
* can be set to make sure a fraction of the unused blocks are available for
* writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
s64   available, free, threshold;
s32   remainder;

if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
  return false;

available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

if (available <
     mp->m_groups[XG_TYPE_RTG].blocks *
     (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
  return true;

free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

threshold = div_s64_rem(free, 100, &remainder);
threshold = threshold * mp->m_zonegc_low_space +
      remainder * div_s64(mp->m_zonegc_low_space, 100);

if (available < threshold)
  return true;

return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
struct xfs_mount *mp)
{
struct xfs_zone_gc_data *data;
int   i;

data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
  return NULL;
data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
   GFP_KERNEL);
if (!data->iter.recs)
  goto out_free_data;

/*
* We actually only need a single bio_vec.  It would be nice to have
* a flag that only allocates the inline bvecs and not the separate
* bvec pool.
*/
if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
   BIOSET_NEED_BVECS))
  goto out_free_recs;
for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
  data->scratch[i].folio =
   folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
  if (!data->scratch[i].folio)
   goto out_free_scratch;
}
INIT_LIST_HEAD(&data->reading);
INIT_LIST_HEAD(&data->writing);
INIT_LIST_HEAD(&data->resetting);
data->mp = mp;
return data;

out_free_scratch:
while (--i >= 0)
  folio_put(data->scratch[i].folio);
bioset_exit(&data->bio_set);
out_free_recs:
kfree(data->iter.recs);
out_free_data:
kfree(data);
return NULL;
}

static void
xfs_zone_gc_data_free(
struct xfs_zone_gc_data *data)
{
int   i;

for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
  folio_put(data->scratch[i].folio);
bioset_exit(&data->bio_set);
kfree(data->iter.recs);
kfree(data);
}

static void
xfs_zone_gc_iter_init(
struct xfs_zone_gc_iter *iter,
struct xfs_rtgroup *victim_rtg)

{
iter->next_startblock = 0;
iter->rec_count = 0;
iter->rec_idx = 0;
iter->victim_rtg = victim_rtg;
}

/*
* Query the rmap of the victim zone to gather the records to evacuate.
*/
static int
xfs_zone_gc_query_cb(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *irec,
void   *private)
{
struct xfs_zone_gc_iter *iter = private;

ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

iter->recs[iter->rec_count] = *irec;
if (++iter->rec_count == XFS_ZONE_GC_RECS) {
  iter->next_startblock =
   irec->rm_startblock + irec->rm_blockcount;
  return 1;
}
return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
const void   *a,
const void   *b)
{
const struct xfs_rmap_irec *reca = a;
const struct xfs_rmap_irec *recb = b;
int    diff;

diff = cmp_int(reca->rm_owner, recb->rm_owner);
if (diff)
  return diff;
return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
struct xfs_mount *mp,
struct xfs_zone_gc_iter *iter)
{
struct xfs_rtgroup *rtg = iter->victim_rtg;
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
struct xfs_btree_cur *cur;
struct xfs_trans *tp;
int   error;

ASSERT(iter->next_startblock <= rtg_blocks(rtg));
if (iter->next_startblock == rtg_blocks(rtg))
  goto done;

ASSERT(iter->next_startblock < rtg_blocks(rtg));
ri_low.rm_startblock = iter->next_startblock;
memset(&ri_high, 0xFF, sizeof(ri_high));

iter->rec_idx = 0;
iter->rec_count = 0;

tp = xfs_trans_alloc_empty(mp);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
cur = xfs_rtrmapbt_init_cursor(tp, rtg);
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
   xfs_zone_gc_query_cb, iter);
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
xfs_btree_del_cursor(cur, error < 0 ? error : 0);
xfs_trans_cancel(tp);

if (error < 0)
  return error;

/*
* Sort the rmap records by inode number and increasing offset to
* defragment the mappings.
*
* This could be further enhanced by an even bigger look ahead window,
* but that's better left until we have better detection of changes to
* inode mapping to avoid the potential of GCing already dead data.
*/
sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
   xfs_zone_gc_rmap_rec_cmp, NULL);

if (error == 0) {
  /*
* We finished iterating through the zone.
*/
  iter->next_startblock = rtg_blocks(rtg);
  if (iter->rec_count == 0)
   goto done;
}

return 0;
done:
xfs_rtgroup_rele(iter->victim_rtg);
iter->victim_rtg = NULL;
return 0;
}

static bool
xfs_zone_gc_iter_next(
struct xfs_mount *mp,
struct xfs_zone_gc_iter *iter,
struct xfs_rmap_irec *chunk_rec,
struct xfs_inode **ipp)
{
struct xfs_rmap_irec *irec;
int   error;

if (!iter->victim_rtg)
  return false;

retry:
if (iter->rec_idx == iter->rec_count) {
  error = xfs_zone_gc_query(mp, iter);
  if (error)
   goto fail;
  if (!iter->victim_rtg)
   return false;
}

irec = &iter->recs[iter->rec_idx];
error = xfs_iget(mp, NULL, irec->rm_owner,
   XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
if (error) {
  /*
* If the inode was already deleted, skip over it.
*/
  if (error == -ENOENT) {
   iter->rec_idx++;
   goto retry;
  }
  goto fail;
}

if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
  iter->rec_idx++;
  xfs_irele(*ipp);
  goto retry;
}

*chunk_rec = *irec;
return true;

fail:
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
return false;
}

static void
xfs_zone_gc_iter_advance(
struct xfs_zone_gc_iter *iter,
xfs_extlen_t  count_fsb)
{
struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];

irec->rm_offset += count_fsb;
irec->rm_startblock += count_fsb;
irec->rm_blockcount -= count_fsb;
if (!irec->rm_blockcount)
  iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
struct xfs_mount *mp,
uint32_t  bucket)
{
struct xfs_zone_info *zi = mp->m_zone_info;
uint32_t  victim_used = U32_MAX;
struct xfs_rtgroup *victim_rtg = NULL;
uint32_t  bit;

if (!zi->zi_used_bucket_entries[bucket])
  return NULL;

for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
   mp->m_sb.sb_rgcount) {
  struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

  if (!rtg)
   continue;

  /* skip zones that are just waiting for a reset */
  if (rtg_rmap(rtg)->i_used_blocks == 0 ||
      rtg_rmap(rtg)->i_used_blocks >= victim_used) {
   xfs_rtgroup_rele(rtg);
   continue;
  }

  if (victim_rtg)
   xfs_rtgroup_rele(victim_rtg);
  victim_rtg = rtg;
  victim_used = rtg_rmap(rtg)->i_used_blocks;

  /*
* Any zone that is less than 1 percent used is fair game for
* instant reclaim. All of these zones are in the last
* bucket, so avoid the expensive division for the zones
* in the other buckets.
*/
  if (bucket == 0 &&
      rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
   break;
}

return victim_rtg;
}

/*
* Iterate through all zones marked as reclaimable and find a candidate to
* reclaim.
*/
static bool
xfs_zone_gc_select_victim(
struct xfs_zone_gc_data *data)
{
struct xfs_zone_gc_iter *iter = &data->iter;
struct xfs_mount *mp = data->mp;
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int  bucket;

if (xfs_is_shutdown(mp))
  return false;

if (iter->victim_rtg)
  return true;

/*
* Don't start new work if we are asked to stop or park.
*/
if (kthread_should_stop() || kthread_should_park())
  return false;

if (!xfs_zoned_need_gc(mp))
  return false;

spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
  victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
  if (victim_rtg)
   break;
}
spin_unlock(&zi->zi_used_buckets_lock);

if (!victim_rtg)
  return false;

trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
xfs_zone_gc_iter_init(iter, victim_rtg);
return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
struct xfs_zone_info *zi)
{
struct xfs_open_zone *oz, *found = NULL;

spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
  if (!found || oz->oz_allocated < found->oz_allocated)
   found = oz;
}

if (found) {
  found->oz_is_gc = true;
  list_del_init(&found->oz_entry);
  zi->zi_nr_open_zones--;
}

spin_unlock(&zi->zi_open_zones_lock);
return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *oz = zi->zi_open_gc_zone;

/*
* We need to wait for pending writes to finish.
*/
if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
  return NULL;

ASSERT(zi->zi_nr_open_zones <=
  mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
if (oz)
  trace_xfs_zone_gc_target_opened(oz->oz_rtg);
spin_lock(&zi->zi_open_zones_lock);
zi->zi_open_gc_zone = oz;
spin_unlock(&zi->zi_open_zones_lock);
return oz;
}

/*
* Ensure we have a valid open zone to write the GC data to.
*
* If the current target zone has space keep writing to it, else first wait for
* all pending writes and then pick a new one.
*/
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
struct xfs_mount *mp)
{
struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;

if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
  return xfs_zone_gc_select_target(mp);
return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
struct xfs_zone_gc_data *data)
{
return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
struct xfs_zone_gc_data *data)
{
struct xfs_open_zone *oz;

oz = xfs_zone_gc_ensure_target(data->mp);
if (!oz)
  return false;
return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
  xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
struct bio  *bio)
{
struct xfs_gc_bio *chunk =
  container_of(bio, struct xfs_gc_bio, bio);
struct xfs_zone_gc_data *data = chunk->data;

WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
struct xfs_zone_gc_data *data,
xfs_extlen_t  *count_fsb,
xfs_daddr_t  *daddr,
bool   *is_seq)
{
struct xfs_mount *mp = data->mp;
struct xfs_open_zone *oz;

oz = xfs_zone_gc_ensure_target(mp);
if (!oz)
  return NULL;

*count_fsb = min(*count_fsb,
  XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

/*
* Directly allocate GC blocks from the reserved pool.
*
* If we'd take them from the normal pool we could be stealing blocks
* from a regular writer, which would then have to wait for GC and
* deadlock.
*/
spin_lock(&mp->m_sb_lock);
*count_fsb = min(*count_fsb,
   rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
*count_fsb = min3(*count_fsb,
   mp->m_free[XC_FREE_RTEXTENTS].res_avail,
   mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
spin_unlock(&mp->m_sb_lock);

if (!*count_fsb)
  return NULL;

*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
if (!*is_seq)
  *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
oz->oz_allocated += *count_fsb;
atomic_inc(&oz->oz_ref);
return oz;
}

static bool
xfs_zone_gc_start_chunk(
struct xfs_zone_gc_data *data)
{
struct xfs_zone_gc_iter *iter = &data->iter;
struct xfs_mount *mp = data->mp;
struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
struct xfs_open_zone *oz;
struct xfs_rmap_irec irec;
struct xfs_gc_bio *chunk;
struct xfs_inode *ip;
struct bio  *bio;
xfs_daddr_t  daddr;
bool   is_seq;

if (xfs_is_shutdown(mp))
  return false;

if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
  return false;
oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
   &is_seq);
if (!oz) {
  xfs_irele(ip);
  return false;
}

bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
chunk->old_startblock =
  xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
chunk->is_seq = is_seq;
chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;

bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
   chunk->scratch->offset);
chunk->scratch->offset += chunk->len;
if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
  data->scratch_idx =
   (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
}
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->reading);
xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

submit_bio(bio);
return true;
}

static void
xfs_zone_gc_free_chunk(
struct xfs_gc_bio *chunk)
{
list_del(&chunk->entry);
xfs_open_zone_put(chunk->oz);
xfs_irele(chunk->ip);
bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
struct xfs_zone_gc_data *data,
struct xfs_gc_bio *chunk)
{
if (chunk->is_seq) {
  chunk->bio.bi_opf &= ~REQ_OP_WRITE;
  chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
}
chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
chunk->bio.bi_end_io = xfs_zone_gc_end_io;
submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
struct xfs_zone_gc_data *data,
struct xfs_gc_bio *chunk)
{
struct queue_limits *lim =
  &bdev_get_queue(chunk->bio.bi_bdev)->limits;
struct xfs_gc_bio *split_chunk;
int   split_sectors;
unsigned int  split_len;
struct bio  *split;
unsigned int  nsegs;

if (!chunk->is_seq)
  return NULL;

split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
   lim->max_zone_append_sectors << SECTOR_SHIFT);
if (!split_sectors)
  return NULL;

/* ensure the split chunk is still block size aligned */
split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
   data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
split_len = split_sectors << SECTOR_SHIFT;

split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
split_chunk = container_of(split, struct xfs_gc_bio, bio);
split_chunk->data = data;
ihold(VFS_I(chunk->ip));
split_chunk->ip = chunk->ip;
split_chunk->is_seq = chunk->is_seq;
split_chunk->scratch = chunk->scratch;
split_chunk->offset = chunk->offset;
split_chunk->len = split_len;
split_chunk->old_startblock = chunk->old_startblock;
split_chunk->new_daddr = chunk->new_daddr;
split_chunk->oz = chunk->oz;
atomic_inc(&chunk->oz->oz_ref);

chunk->offset += split_len;
chunk->len -= split_len;
chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

/* add right before the original chunk */
WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&split_chunk->entry, &chunk->entry);
return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
struct xfs_gc_bio *chunk)
{
struct xfs_zone_gc_data *data = chunk->data;
struct xfs_mount *mp = chunk->ip->i_mount;
phys_addr_t  bvec_paddr =
  bvec_phys(bio_first_bvec_all(&chunk->bio));
struct xfs_gc_bio *split_chunk;

if (chunk->bio.bi_status)
  xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
if (xfs_is_shutdown(mp)) {
  xfs_zone_gc_free_chunk(chunk);
  return;
}

WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_move_tail(&chunk->entry, &data->writing);

bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
   offset_in_folio(chunk->scratch->folio, bvec_paddr));

while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
  xfs_zone_gc_submit_write(data, split_chunk);
xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
struct xfs_gc_bio *chunk)
{
uint   iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
struct xfs_inode *ip = chunk->ip;
struct xfs_mount *mp = ip->i_mount;
int   error;

if (chunk->bio.bi_status)
  xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
if (xfs_is_shutdown(mp)) {
  xfs_zone_gc_free_chunk(chunk);
  return;
}

chunk->scratch->freed += chunk->len;
if (chunk->scratch->freed == chunk->scratch->offset) {
  chunk->scratch->offset = 0;
  chunk->scratch->freed = 0;
}

/*
* Cycle through the iolock and wait for direct I/O and layouts to
* ensure no one is reading from the old mapping before it goes away.
*
* Note that xfs_zoned_end_io() below checks that no other writer raced
* with us to update the mapping by checking that the old startblock
* didn't change.
*/
xfs_ilock(ip, iolock);
error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
if (!error)
  inode_dio_wait(VFS_I(ip));
xfs_iunlock(ip, iolock);
if (error)
  goto free;

if (chunk->is_seq)
  chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
   chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
if (error)
  xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
struct xfs_gc_bio *chunk)
{
struct xfs_rtgroup *rtg = chunk->bio.bi_private;
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_zone_info *zi = mp->m_zone_info;

if (chunk->bio.bi_status) {
  xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
  goto out;
}

xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
atomic_inc(&zi->zi_nr_free_zones);

xfs_zoned_add_available(mp, rtg_blocks(rtg));

wake_up_all(&zi->zi_zone_wait);
out:
list_del(&chunk->entry);
bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
struct bio  *bio,
struct xfs_rtgroup *rtg)
{
trace_xfs_zone_reset(rtg);

ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
  if (!bdev_max_discard_sectors(bio->bi_bdev))
   return false;
  bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
  bio->bi_iter.bi_size =
   XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
}

return true;
}

int
xfs_zone_gc_reset_sync(
struct xfs_rtgroup *rtg)
{
int   error = 0;
struct bio  bio;

bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
   REQ_OP_ZONE_RESET);
if (xfs_zone_gc_prepare_reset(&bio, rtg))
  error = submit_bio_wait(&bio);
bio_uninit(&bio);

return error;
}

static void
xfs_zone_gc_reset_zones(
struct xfs_zone_gc_data *data,
struct xfs_group *reset_list)
{
struct xfs_group *next = reset_list;

if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
  xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
  return;
}

do {
  struct xfs_rtgroup *rtg = to_rtg(next);
  struct xfs_gc_bio *chunk;
  struct bio  *bio;

  xfs_log_force_inode(rtg_rmap(rtg));

  next = rtg_group(rtg)->xg_next_reset;
  rtg_group(rtg)->xg_next_reset = NULL;

  bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
    0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
  bio->bi_private = rtg;
  bio->bi_end_io = xfs_zone_gc_end_io;

  chunk = container_of(bio, struct xfs_gc_bio, bio);
  chunk->data = data;
  WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
  list_add_tail(&chunk->entry, &data->resetting);

  /*
* Also use the bio to drive the state machine when neither
* zone reset nor discard is supported to keep things simple.
*/
  if (xfs_zone_gc_prepare_reset(bio, rtg))
   submit_bio(bio);
  else
   bio_endio(bio);
} while (next);
}

/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
*
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
static bool
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
struct xfs_zone_info *zi = data->mp->m_zone_info;
struct xfs_gc_bio *chunk, *next;
struct xfs_group *reset_list;
struct blk_plug  plug;

spin_lock(&zi->zi_reset_list_lock);
reset_list = zi->zi_reset_list;
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);

if (!xfs_zone_gc_select_victim(data) ||
     !xfs_zone_gc_space_available(data)) {
  if (list_empty(&data->reading) &&
      list_empty(&data->writing) &&
      list_empty(&data->resetting) &&
      !reset_list)
   return false;
}

__set_current_state(TASK_RUNNING);
try_to_freeze();

if (reset_list)
  xfs_zone_gc_reset_zones(data, reset_list);

list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
  if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
   break;
  xfs_zone_gc_finish_reset(chunk);
}

list_for_each_entry_safe(chunk, next, &data->writing, entry) {
  if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
   break;
  xfs_zone_gc_finish_chunk(chunk);
}

blk_start_plug(&plug);
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
  if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
   break;
  xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);

blk_start_plug(&plug);
while (xfs_zone_gc_start_chunk(data))
  ;
blk_finish_plug(&plug);
return true;
}

/*
* Note that the current GC algorithm would break reflinks and thus duplicate
* data that was shared by multiple owners before.  Because of that reflinks
* are currently not supported on zoned file systems and can't be created or
* mounted.
*/
static int
xfs_zoned_gcd(
void   *private)
{
struct xfs_zone_gc_data *data = private;
struct xfs_mount *mp = data->mp;
struct xfs_zone_info *zi = mp->m_zone_info;
unsigned int  nofs_flag;

nofs_flag = memalloc_nofs_save();
set_freezable();

for (;;) {
  set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
  xfs_set_zonegc_running(mp);
  if (xfs_zone_gc_handle_work(data))
   continue;

  if (list_empty(&data->reading) &&
      list_empty(&data->writing) &&
      list_empty(&data->resetting) &&
      !zi->zi_reset_list) {
   xfs_clear_zonegc_running(mp);
   xfs_zoned_resv_wake_all(mp);

   if (kthread_should_stop()) {
    __set_current_state(TASK_RUNNING);
    break;
   }

   if (kthread_should_park()) {
    __set_current_state(TASK_RUNNING);
    kthread_parkme();
    continue;
   }
  }

  schedule();
}
xfs_clear_zonegc_running(mp);

if (data->iter.victim_rtg)
  xfs_rtgroup_rele(data->iter.victim_rtg);

memalloc_nofs_restore(nofs_flag);
xfs_zone_gc_data_free(data);
return 0;
}

void
xfs_zone_gc_start(
struct xfs_mount *mp)
{
if (xfs_has_zoned(mp))
  kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
struct xfs_mount *mp)
{
if (xfs_has_zoned(mp))
  kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_gc_data *data;
struct xfs_open_zone *oz;
int   error;

/*
* If there are no free zones available for GC, pick the open zone with
* the least used space to GC into.  This should only happen after an
* unclean shutdown near ENOSPC while GC was ongoing.
*
* We also need to do this for the first gc zone allocation if we
* unmounted while at the open limit.
*/
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
     zi->zi_nr_open_zones == mp->m_max_open_zones)
  oz = xfs_zone_gc_steal_open(zi);
else
  oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
if (!oz) {
  xfs_warn(mp, "unable to allocate a zone for gc");
  error = -EIO;
  goto out;
}

trace_xfs_zone_gc_target_opened(oz->oz_rtg);
zi->zi_open_gc_zone = oz;

data = xfs_zone_gc_data_alloc(mp);
if (!data) {
  error = -ENOMEM;
  goto out_put_gc_zone;
}

mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
   "xfs-zone-gc/%s", mp->m_super->s_id);
if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
  xfs_warn(mp, "unable to create zone gc thread");
  error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
  goto out_free_gc_data;
}

/* xfs_zone_gc_start will unpark for rw mounts */
kthread_park(mp->m_zone_info->zi_gc_thread);
return 0;

out_free_gc_data:
kfree(data);
out_put_gc_zone:
xfs_open_zone_put(zi->zi_open_gc_zone);
out:
return error;
}

void
xfs_zone_gc_unmount(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;

kthread_stop(zi->zi_gc_thread);
if (zi->zi_open_gc_zone)
  xfs_open_zone_put(zi->zi_open_gc_zone);
}

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.12 Sekunden (vorverarbeitet am 2026-06-08) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.