/* max size of osd read request, limited by libceph */
#define CEPH_MAX_READ_SIZE		CEPH_MSG_MAX_DATA_LEN
/* osd has a configurable limitation of max write size.
 * CEPH_MSG_MAX_DATA_LEN should be small enough. */
#define CEPH_MAX_WRITE_SIZE		CEPH_MSG_MAX_DATA_LEN
#define CEPH_RASIZE_DEFAULT		(8192*1024)	/* max readahead */
#define CEPH_MAX_READDIR_DEFAULT	1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT	(512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT	".snap"
/*
 * Delay telling the MDS we no longer want caps, in case we reopen
 * the file.  Delay a minimum amount of time, even if we send a cap
 * message for some other reason.  Otherwise, take the opportunity to
 * update the mds to avoid sending another message later.
 */
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT	5	/* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT	60	/* cap release delay */
struct ceph_mount_options { unsignedint flags;
unsignedint wsize; /* max write size */ unsignedint rsize; /* max read size */ unsignedint rasize; /* max readahead */ unsignedint congestion_kb; /* max writeback in flight */ unsignedint caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; unsignedint max_readdir; /* max readdir result (entries) */ unsignedint max_readdir_bytes; /* max readdir result (bytes) */
bool new_dev_syntax;
/* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options()
*/
/* * Check if the mds namespace in ceph_mount_options matches * the passed in namespace string. First time match (when * ->mds_namespace is NULL) is treated specially, since * ->mds_namespace needs to be initialized by the caller.
*/ staticinlineint namespace_equals(struct ceph_mount_options *fsopt, constchar *namespace, size_t len)
{ return !(fsopt->mds_namespace &&
(strlen(fsopt->mds_namespace) != len ||
strncmp(fsopt->mds_namespace, namespace, len)));
}
/* mount state */ enum {
	CEPH_MOUNT_MOUNTING,	/* mount in progress */
	CEPH_MOUNT_MOUNTED,	/* mounted and usable */
	CEPH_MOUNT_UNMOUNTING,	/* unmount in progress */
	CEPH_MOUNT_UNMOUNTED,	/* unmount finished */
	CEPH_MOUNT_SHUTDOWN,	/* forcibly shut down — TODO confirm trigger */
	CEPH_MOUNT_RECOVER,	/* recovering session state? — verify */
	CEPH_MOUNT_FENCE_IO,	/* fencing in-flight I/O? — verify */
};
/*
 * File i/o capability.  This tracks shared state with the metadata
 * server that allows us to cache or writeback attributes or to read
 * and write data.  For any given inode, we should have one or more
 * capabilities, one issued by each metadata server, and our
 * cumulative access is the OR of all issued capabilities.
 *
 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
 * session capability lists.
 */
struct ceph_cap {
	struct ceph_inode_info *ci;
	struct rb_node ci_node;		/* per-ci cap tree */
	struct ceph_mds_session *session;
	struct list_head session_caps;	/* per-session caplist */
	u64 cap_id;			/* unique cap id (mds provided) */
	union {
		/* in-use caps */
		struct {
			int issued;	 /* latest, from the mds */
			int implemented; /* implemented superset of
					    issued (for revocation) */
			int mds;	 /* mds index for this cap */
			int mds_wanted;	 /* caps wanted from this mds */
		};
		/* caps to release */
		struct {
			u64 cap_ino;	   /* ino of cap being released — TODO confirm */
			int queue_release; /* nonzero if queued for release — TODO confirm */
		};
	};
	u32 seq, issue_seq, mseq;	/* cap, issue and migrate seqs (mds provided) — TODO confirm */
	u32 cap_gen;			/* active/stale cycle */
	unsigned long last_used;	/* jiffies of last use — TODO confirm units */
	struct list_head caps_item;
};
#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ #define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ #define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ #define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */
struct ceph_cap_flush {
u64 tid; int caps; bool wake; /* wake up flush waiters when finish ? */ bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode
};
/*
 * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
 * we first complete any in-process sync writes and writeback any dirty
 * data before flushing the snapped state (tracked here) back to the MDS.
 */
struct ceph_cap_snap {
	refcount_t nref;		/* dropped via ceph_put_cap_snap() */
	struct list_head ci_item;	/* link on the inode's i_cap_snaps list */
	struct ceph_cap_flush cap_flush;
	u64 follows;			/* snapid this state follows — TODO confirm */
	int issued, dirty;		/* cap bits captured at snapshot — TODO confirm */
	struct ceph_snap_context *context;
	/* inode attributes captured at snapshot time */
	u64 size;
	u64 change_attr;
	struct timespec64 mtime, atime, ctime, btime;
	u64 time_warp_seq;
	u64 truncate_size;
	u32 truncate_seq;
	int writing;	/* a sync write is still in progress */
	int dirty_pages;	/* dirty pages awaiting writeback */
	bool inline_data;
	bool need_flush;
	/*
	 * NOTE(review): ceph_put_cap_snap() below frees capsnap->xattr_blob,
	 * but no xattr_blob member is visible here — this field list looks
	 * truncated by extraction; verify against the upstream header.
	 */
};
/*
 * Drop a reference on @capsnap.  When the last reference goes away,
 * release the attached xattr blob (if any) and return the object to
 * its slab cache.
 */
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
	if (!refcount_dec_and_test(&capsnap->nref))
		return;

	if (capsnap->xattr_blob)
		ceph_buffer_put(capsnap->xattr_blob);
	kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
/*
 * The frag tree describes how a directory is fragmented, potentially across
 * multiple metadata servers.  It is also used to indicate points where
 * metadata authority is delegated, and whether/where metadata is replicated.
 *
 * A _leaf_ frag will be present in the i_fragtree IFF there is
 * delegation info.  That is, if mds >= 0 || ndist > 0.
 */
#define CEPH_MAX_DIRFRAG_REP 4	/* max dist[] entries tracked per frag */
struct ceph_inode_frag {
	struct rb_node node;	/* link in the inode's i_fragtree — TODO confirm */
	/* fragtree state */
	u32 frag;		/* frag id */
	int split_by;		/* i.e. 2^(split_by) children */
	/* delegation and replication info */
	int mds;		/* -1 if same authority as parent */
	int ndist;		/* >0 if replicated */
	int dist[CEPH_MAX_DIRFRAG_REP];
};
/* * We cache inode xattrs as an encoded blob until they are first used, * at which point we parse them into an rbtree.
*/ struct ceph_inode_xattr { struct rb_node node;
constchar *name; int name_len; constchar *val; int val_len; int dirty;
struct ceph_inode_xattrs_info {
	/*
	 * (still encoded) xattr blob. we avoid the overhead of parsing
	 * this until someone actually calls getxattr, etc.
	 *
	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
	 * NULL means we don't know.
	 */
	struct ceph_buffer *blob, *prealloc_blob;
	struct rb_root index;	/* parsed xattrs (rbtree) */
	bool dirty;		/* xattrs modified locally? — verify */
	int count;		/* number of parsed entries — TODO confirm */
	int names_size;		/* aggregate bytes of names — TODO confirm */
	int vals_size;		/* aggregate bytes of values — TODO confirm */
	u64 version, index_version;
};
struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex;
struct ceph_inode_xattrs_info i_xattrs;
/* capabilities. protected _both_ by i_ceph_lock and cap->session's
* s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
/* * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty * is protected by the mdsc->cap_dirty_lock, but each individual item * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty * requires the mdsc->cap_dirty_lock. List presence for an item can * be tested under the i_ceph_lock. Changing anything requires both.
*/ struct list_head i_dirty_item;
/* * Link to session's s_cap_flushing list. Protected in a similar * fashion to i_dirty_item, but also by the s_mutex for changes. The * s_cap_flushing list can be walked while holding either the s_mutex * or msdc->cap_dirty_lock. List presence can also be checked while * holding the i_ceph_lock for this inode.
*/ struct list_head i_flushing_item;
/* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */ struct ceph_cap_flush *i_prealloc_cap_flush; struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsignedlong i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */
unsignedlong i_last_rd; unsignedlong i_last_wr; int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ /* * For none fscrypt case it equals to i_truncate_size or it will * equals to fscrypt_file_size
*/
u64 i_truncate_pagecache_size;
u64 i_max_size; /* max file size authorized by mds */
u64 i_reported_size; /* (max_)size reported to or requested of mds */
u64 i_wanted_max_size; /* offset we'd like to write too */
u64 i_requested_max_size; /* max_size we've requested */
/* held references to caps */ int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
/* * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on * some arches. We generally do not use this value inside the ceph driver, but * we do want to set it to something, so that generic vfs code has an * appropriate value for tracepoints and the like.
*/ staticinline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
{ if (sizeof(ino_t) == sizeof(u32)) return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino;
}
/* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
/** * ceph_present_ino - format an inode number for presentation to userland * @sb: superblock where the inode lives * @ino: inode number to (possibly) convert * * If the user mounted with the ino32 option, then the 64-bit value needs * to be converted to something that can fit inside 32 bits. Note that * internal kernel code never uses this value, so this is entirely for * userland consumption.
*/ staticinline u64 ceph_present_ino(struct super_block *sb, u64 ino)
{ if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32))) return ceph_ino_to_ino32(ino); return ino;
}
/*
 * The MDS reserves a set of inodes for its own usage.  These should never
 * be accessible by clients, and so the MDS has no reason to ever hand these
 * out.  The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE.
 *
 * These come from src/mds/mdstypes.h in the ceph sources.
 */
#define CEPH_MAX_MDS			0x100
#define CEPH_NUM_STRAY			10
#define CEPH_MDS_INO_MDSDIR_OFFSET	(1 * CEPH_MAX_MDS)
#define CEPH_MDS_INO_LOG_OFFSET		(2 * CEPH_MAX_MDS)
#define CEPH_INO_SYSTEM_BASE		((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY))
/* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to * 32-bits first. Just use low-order bits on arches with 32-bit long.
*/ return ilookup5(sb, (unsignedlong)vino.ino, ceph_ino_compare, &vino);
}
/*
 * Ceph inode.
 */
#define CEPH_I_DIR_ORDERED	(1 << 0)  /* dentries in dir are ordered */
/* NOTE(review): bit 1 is unused here — possibly a retired flag; verify upstream */
#define CEPH_I_FLUSH		(1 << 2)  /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM	(1 << 3)  /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD		(1 << 4)  /* can read from pool */
#define CEPH_I_POOL_WR		(1 << 5)  /* can write to pool */
#define CEPH_I_SEC_INITED	(1 << 6)  /* security initialized */
#define CEPH_I_KICK_FLUSH	(1 << 7)  /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS	(1 << 8)  /* need flush snaps */
#define CEPH_I_ERROR_WRITE	(1 << 9)  /* have seen write errors */
#define CEPH_I_ERROR_FILELOCK	(1 << 10) /* have seen file lock errors */
#define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
#define CEPH_ASYNC_CREATE_BIT	(12)	  /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE	(1 << CEPH_ASYNC_CREATE_BIT)
#define CEPH_I_SHUTDOWN		(1 << 13) /* inode is no longer usable */
#define CEPH_I_ASYNC_CHECK_CAPS	(1 << 14) /* check caps immediately after async
					     creating finishes */
/*
 * We set the ERROR_WRITE bit when we start seeing write errors on an inode
 * and then clear it when they start succeeding.  Note that we do a lockless
 * check first, and only take the lock if it looks like it needs to be changed.
 * The write submission code just takes this as a hint, so we're not too
 * worried if a few slip through in either direction.
 */
static inline void ceph_set_error_write(struct ceph_inode_info *ci)
{
	/* Lockless fast path: bit already set, nothing to do. */
	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)
		return;

	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
	spin_unlock(&ci->i_ceph_lock);
}
/* find a specific frag @f */ externstruct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
u32 f);
/* * choose fragment for value @v. copy frag content to pfrag, if leaf * exists
*/ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found);
/* readdir: position within the dir */
u32 frag; struct ceph_mds_request *last_readdir;
/* readdir: position within a frag */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ longlong dir_release_count; longlong dir_ordered_count; int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len;
};
/* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) * the snapshots of their parents. * * All inodes within the realm that have capabilities are linked into a * per-realm list.
*/ struct ceph_snap_realm {
u64 ino; struct inode *inode;
atomic_t nref; struct rb_node node;
u64 created, seq;
u64 parent_ino;
u64 parent_since; /* snapid when our current parent became so */
u64 *prior_parent_snaps; /* snaps inherited from any parents we */
u32 num_prior_parent_snaps; /* had prior to parent_since */
u64 *snaps; /* snaps specific to this realm */
u32 num_snaps;
struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ struct list_head child_item;
struct list_head empty_item; /* if i have ref==0 */
struct list_head dirty_item; /* if realm needs new context */
/* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.).
*/ staticinlinebool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{ return !list_empty(&ci->i_cap_snaps) &&
list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
ci_item)->writing;
}
externint __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got); externint ceph_get_caps(struct file *filp, int need, int want,
loff_t endoff, int *got); externint ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got);
/* for counting open files by mode */ externvoid ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); externvoid ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); externvoid __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode);
/*
 * NOTE(review): the text below is residue from the web page this file was
 * extracted from; it is not part of the header.  Translated from German:
 * "The information on this web page was compiled carefully and to the best
 * of our knowledge.  However, neither the completeness, nor the correctness,
 * nor the quality of the information provided is guaranteed.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */