// SPDX-License-Identifier: GPL-2.0-or-later /* vnode and volume validity verification. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com)
*/
/* * Data validation is managed through a number of mechanisms from the server: * * (1) On first contact with a server (such as if it has just been rebooted), * the server sends us a CB.InitCallBackState* request. * * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC * calls, the server maintains a time-limited per-vnode promise that it * will send us a CB.CallBack request if a third party alters the vnodes * accessed. * * Note that vnode-level callbacks may also be sent for other reasons, * such as filelock release. * * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC * calls, each server maintains a time-limited per-volume promise that it * will send us a CB.CallBack request if the RO volume is updated to a * snapshot of the RW volume ("vos release"). This is an atomic event * that cuts over all instances of the RO volume across multiple servers * simultaneously. * * Note that volume-level callbacks may also be sent for other reasons, * such as the volumeserver taking over control of the volume from the * fileserver. * * Note also that each server maintains an independent time limit on an * independent callback. * * (4) Certain RPC calls include a volume information record "VolSync" in * their reply. This contains a creation date for the volume that should * remain unchanged for a RW volume (but will be changed if the volume is * restored from backup) or will be bumped to the time of snapshotting * when a RO volume is released. * * In order to track these events, the following are provided: * * ->cb_v_break. A counter of events that might mean that the contents of * a volume have been altered since we last checked a vnode. * * ->cb_v_check. A counter of the number of events that we've sent a * query to the server for. Everything's up to date if this equals * cb_v_break. * * ->cb_scrub. A counter of the number of regression events for which we * have to completely wipe the cache. * * ->cb_ro_snapshot. 
A counter of the number of times that we've * recognised that a RO volume has been updated. * * ->cb_break. A counter of events that might mean that the contents of a * vnode have been altered. * * ->cb_expires_at. The time at which the callback promise expires or * AFS_NO_CB_PROMISE if we have no promise. * * The way we manage things is: * * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the * volume and volume's server record. * * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level * callback break on all the volumes that have been using that server * (ie. increment ->cb_v_break and reset ->cb_expires_at). * * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also * dispatch a work item to unmap all PTEs to the vnode's pagecache to * force reentry to the filesystem for revalidation. * * (4) When entering the filesystem, we call afs_validate() to check the * validity of a vnode. This first checks to see if ->cb_v_check and * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock * exclusively and perform an FS.FetchStatus on the vnode. * * After checking the volume, we check the vnode. If there's a mismatch * between the volume counters and the vnode's mirrors of those counters, * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. * * (5) When the reply from FS.FetchStatus arrives, the VolSync record is * parsed: * * (A) If the Creation timestamp has changed on a RW volume or regressed * on a RO volume, we try to increment ->cb_scrub; if it advances on a * RO volume, we assume "vos release" happened and try to increment * ->cb_ro_snapshot. * * (B) If the Update timestamp has regressed, we try to increment * ->cb_scrub. 
* * Note that in both of these cases, we only do the increment if we can * cmpxchg the value of the timestamp from the value we noted before the * op. This tries to prevent parallel ops from fighting one another. * * volume->cb_v_check is then set to ->cb_v_break. * * (6) The AFSCallBack record included in the FS.FetchStatus reply is also * parsed and used to set the promise in ->cb_expires_at for the vnode, * the volume and the volume's server record. * * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for * the vnode.
*/
/* * Check the validity of a vnode/inode and its parent volume.
*/ bool afs_check_validity(conststruct afs_vnode *vnode)
{ conststruct afs_volume *volume = vnode->volume; enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
time64_t deadline = ktime_get_real_seconds() + 10;
if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) returntrue;
/* * See if the server we've just talked to is currently excluded.
*/ staticbool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{ conststruct afs_server_entry *se; conststruct afs_server_list *slist; bool is_excluded = true; int i;
rcu_read_lock();
slist = rcu_dereference(volume->servers); for (i = 0; i < slist->nr_servers; i++) {
se = &slist->servers[i]; if (op->server == se->server) {
is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); break;
}
}
rcu_read_unlock(); return is_excluded;
}
/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 *
 * If the server is already seen as excluded, 1 is returned immediately.
 * Otherwise the volume is flagged with AFS_VOLUME_NEEDS_UPDATE,
 * afs_check_volume_status() is called and the exclusion check is repeated.
 *
 * Returns 1 if the server is excluded, 0 if it is not, or the negative value
 * returned by afs_check_volume_status().
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	/* Look again now that the volume status has been rechecked. */
	return __afs_is_server_excluded(op, volume);
}
/* * Handle a change to the volume creation time in the VolSync record.
*/ staticint afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{ unsignedint snap;
time64_t cur = volume->creation_time;
time64_t old = op->pre_volsync.creation;
time64_t new = op->volsync.creation; int ret;
/* Try to advance the creation timestamp from what we had before the * operation to what we got back from the server. This should * hopefully ensure that in a race between multiple operations only one * of them will do this.
*/ if (cur != old) return 0;
/* If the creation time changes in an unexpected way, we need to scrub * our caches. For a RW vol, this will only change if the volume is * restored from a backup; for a RO/Backup vol, this will advance when * the volume is updated to a new snapshot (eg. "vos release").
*/ if (volume->type == AFSVL_RWVOL) goto regressed; if (volume->type == AFSVL_BACKVOL) { if (new < old) goto regressed; goto advance;
}
/* We have an RO volume, we need to query the VL server and look at the * server flags to see if RW->RO replication is in progress.
*/
ret = afs_is_server_excluded(op, volume); if (ret < 0) return ret; if (ret > 0) {
snap = atomic_read(&volume->cb_ro_snapshot);
trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded); return ret;
}
/*
 * Handle a change to the volume update time in the VolSync record.
 *
 * Compares the update time recorded on the volume (cur), the value noted
 * before the operation (op->pre_volsync.update) and the value returned in the
 * reply (op->volsync.update).  If the volume has no recorded time yet
 * (TIME64_MIN), the new value is simply stored.  If the time has gone
 * backwards relative to the pre-op value, ->cb_scrub is incremented, but only
 * if the recorded time still matches the pre-op value.
 */
static void afs_update_volume_update_time(struct afs_operation *op,
					  struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	/* First time we've seen an update time for this volume. */
	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}
/*
 * Compare the creation and update times in the operation's VolSync record
 * against those recorded on the volume and process any differences.
 *
 * The common case of both times matching is checked without taking any lock.
 * Otherwise volume->volsync_lock is held whilst the creation time and then
 * the update time are handled.
 *
 * Returns 0 if the creation time needed no handling, otherwise the value
 * returned by afs_update_volume_creation_time().  If that value is negative,
 * the update time is left untouched.
 */
static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	/* Recheck each time under the lock before acting on it. */
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}
/* * Update the state of a volume, including recording the expiration time of the * callback promise. Returns 1 to redo the operation from the start.
*/ int afs_update_volume_state(struct afs_operation *op)
{ struct afs_server_list *slist = op->server_list; struct afs_server_entry *se = &slist->servers[op->server_index]; struct afs_callback *cb = &op->file[0].scb.callback; struct afs_volume *volume = op->volume; unsignedint cb_v_break = atomic_read(&volume->cb_v_break); unsignedint cb_v_check = atomic_read(&volume->cb_v_check); int ret;
_enter("%llx", op->volume->vid);
if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
ret = afs_update_volume_times(op, volume); if (ret != 0) {
_leave(" = %d", ret); return ret;
}
}
/*
 * Mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 *
 * Calls afs_invalidate_cache() on the vnode and then invalidates the inode's
 * pagecache over the whole file range (0 to LLONG_MAX).
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink
	 *
	 * The two calls differ only in the second argument, which is true for
	 * regular files and false for everything else.
	 */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}
/* * validate a vnode/inode * - there are several things we need to check * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, * symlink) * - parent dir metadata changed (security changes) * - dentry data changed (write, truncate) * - dentry metadata changed (security changes)
*/ int afs_validate(struct afs_vnode *vnode, struct key *key)
{ struct afs_volume *volume = vnode->volume; unsignedint cb_ro_snapshot, cb_scrub;
time64_t deadline = ktime_get_real_seconds() + 10; bool zap = false, locked_vol = false; int ret;
if (afs_check_validity(vnode)) return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
ret = down_write_killable(&vnode->validate_lock); if (ret < 0) goto error;
if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
ret = -ESTALE; goto error_unlock;
}
/* Validate a volume after the v_break has changed or the volume * callback expired. We only want to do this once per volume per * v_break change. The actual work will be done when parsing the * status fetch reply.
*/ if (volume->cb_expires_at <= deadline ||
atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
ret = mutex_lock_interruptible(&volume->cb_check_lock); if (ret < 0) goto error_unlock;
locked_vol = true;
}
/* if the vnode's data version number changed then its contents are
* different */
zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); if (zap)
afs_zap_data(vnode);
up_write(&vnode->validate_lock);
_leave(" = 0"); return 0;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.