/*
 * Leave ctime and mtime alone.
 *
 * At present this is a special-purpose hack for the XFS open_by_handle
 * ioctl; the hope is to promote it to a proper O_CMTIME flag supported by
 * open(2).
 */
#define FMODE_NOCMTIME		((__force fmode_t)(1 << 11))

/* File is embedded in backing_file object */
#define FMODE_BACKING		((__force fmode_t)(1 << 24))

/*
 * Combined with FMODE_NONOTIFY_PERM, selects which fsnotify events are not
 * generated (see FMODE_FSNOTIFY_MASK below).
 */
#define FMODE_NONOTIFY		((__force fmode_t)(1 << 25))

/*
 * Combined with FMODE_NONOTIFY, selects which fsnotify events are not
 * generated (see FMODE_FSNOTIFY_MASK below).
 */
#define FMODE_NONOTIFY_PERM	((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT		((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT	((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT		((__force fmode_t)(1 << 29))

/*
 * The two FMODE_NONOTIFY* bits say which fsnotify events must not be
 * generated for an open file.  Possible values of
 * (f->f_mode & FMODE_FSNOTIFY_MASK) and what they mean:
 *
 * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
 * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
 * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only FAN_ACCESS_PERM.
 */
#define FMODE_FSNOTIFY_MASK	(FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)
/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * Each flag is a distinct bit, used in iattr.ia_valid.
 */
#define ATTR_MODE	(1 << 0)
#define ATTR_UID	(1 << 1)
#define ATTR_GID	(1 << 2)
#define ATTR_SIZE	(1 << 3)
#define ATTR_ATIME	(1 << 4)
#define ATTR_MTIME	(1 << 5)
#define ATTR_CTIME	(1 << 6)
#define ATTR_ATIME_SET	(1 << 7)
#define ATTR_MTIME_SET	(1 << 8)
#define ATTR_FORCE	(1 << 9) /* Not a change, but a change it */
#define ATTR_CTIME_SET	(1 << 10)
#define ATTR_KILL_SUID	(1 << 11)
#define ATTR_KILL_SGID	(1 << 12)
#define ATTR_FILE	(1 << 13)
#define ATTR_KILL_PRIV	(1 << 14)
#define ATTR_OPEN	(1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET	(1 << 16)
#define ATTR_TOUCH	(1 << 17)
#define ATTR_DELEG	(1 << 18) /* Delegated attrs. Don't break write delegations */
/*
 * Whiteout is represented by a char device.  The following constants define
 * the mode and device number to use.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0
/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the ATTR_* definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
	unsigned int	ia_valid;	/* mask of ATTR_* flags */
	umode_t		ia_mode;
	/*
	 * The two anonymous unions wrap structures with the same member.
	 *
	 * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
	 * are a dedicated type requiring the filesystem to use the dedicated
	 * helpers. Other filesystem can continue to use ia_{g,u}id until they
	 * have been ported.
	 *
	 * They always contain the same value. In other words FS_ALLOW_IDMAP
	 * pass down the same value on idmapped mounts as they would on regular
	 * mounts.
	 */
	union {
		kuid_t		ia_uid;
		vfsuid_t	ia_vfsuid;
	};
	union {
		kgid_t		ia_gid;
		vfsgid_t	ia_vfsgid;
	};
	loff_t		ia_size;
	struct timespec64 ia_atime;
	struct timespec64 ia_mtime;
	struct timespec64 ia_ctime;

	/*
	 * Not an attribute, but an auxiliary info for filesystems wanting to
	 * implement an ftruncate() like method.  NOTE: filesystem should
	 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
	 */
	struct file	*ia_file;
};
/* * Includes for diskquotas.
*/ #include <linux/quota.h>
/*
 * Upper bound on how many filesystem layers may be stacked.  The limit
 * exists to prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH	2
/** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page.
*/
/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD		(1 << 16)
#define IOCB_DIRECT		(1 << 17)
#define IOCB_WRITE		(1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ		(1 << 19)
#define IOCB_NOIO		(1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE	(1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP	(1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW		(1 << 23)
#define IOCB_HAS_METADATA	(1 << 24)
/*
 * Per-request I/O control block: the target file, the position, a
 * completion callback and IOCB_* flags, plus a union whose active member
 * depends on which IOCB_* flag is set.
 */
struct kiocb {
	struct file		*ki_filp;
	loff_t			ki_pos;
	void (*ki_complete)(struct kiocb *iocb, long ret);
	void			*private;
	int			ki_flags;
	u16			ki_ioprio; /* See linux/ioprio.h */
	u8			ki_write_stream;
	union {
		/*
		 * Async buffered reads only: the page waitqueue tied to
		 * completion of the read.  Valid IFF IOCB_WAITQ is set.
		 */
		struct wait_page_queue	*ki_waitq;
		/*
		 * Usable for O_DIRECT IO whose completion handling is punted
		 * back to the issuer.  May only be set when the issuer set
		 * IOCB_DIO_CALLER_COMP; the issuer must then check for this
		 * handler when ki_complete is invoked.  The data handed to
		 * the handler must be stored in ->private at the time
		 * dio_complete is assigned.
		 */
		ssize_t (*dio_complete)(void *data);
	};
};
/** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space.
*/ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock;
gfp_t gfp_mask;
atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */
atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsignedlong nrpages;
pgoff_t writeback_index; conststruct address_space_operations *a_ops; unsignedlong flags;
errseq_t wb_err;
spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON.
*/
/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
#define PAGECACHE_TAG_DIRTY	XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK	XA_MARK_1
#define PAGECACHE_TAG_TOWRITE	XA_MARK_2
/* * Returns true if any of the pages in the mapping are marked with the tag.
*/ staticinlinebool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{ return xa_marked(&mapping->i_pages, tag);
}
/*
 * Might pages of this file be mapped into userspace?
 *
 * Returns non-zero when the mapping's i_mmap tree is not empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
	return	!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}
/*
 * Might pages of this file have been modified in userspace?
 * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
 * marks vma as VM_SHARED if it is shared, and the file was opened for
 * writing i.e. vma may be mprotected writable even if now readonly.
 *
 * If i_mmap_writable is negative, no new writable mappings are allowed. You
 * can only deny writable mappings, if none exists right now.
 *
 * Returns non-zero only when the counter is strictly positive.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
	return atomic_read(&mapping->i_mmap_writable) > 0;
}
/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds i_size_ordered_init() initialises the inode's
 * i_size_seqcount and __NEED_I_SIZE_ORDERED is defined; on every other
 * configuration it expands to an empty statement.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
/* The argument is parenthesised so any pointer expression can be passed. */
#define i_size_ordered_init(inode) seqcount_init(&(inode)->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif
struct posix_acl;

/* Sentinel pointer value; never a valid struct posix_acl address. */
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 */
#define ACL_DONT_CACHE ((void *)(-3))
/* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode'
*/ struct inode {
umode_t i_mode; unsignedshort i_opflags;
kuid_t i_uid;
kgid_t i_gid; unsignedint i_flags;
/*
 * Produce the address that stands for @bit of inode->i_state, for use with
 * the wait_var_event() infrastructure: the i_state field's address, viewed
 * as a char pointer, advanced by @bit bytes.
 */
#define inode_state_wait_address(inode, bit) \
	(((char *)&(inode)->i_state) + (bit))
/*
 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
 * want special inodes in the fileset inode space, we make them
 * appear hashed, but do not put on any lists.  hlist_del()
 * will work fine and require no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
	hlist_add_fake(&inode->i_hash);
}
/*
 * Lock-validator nesting subclasses for inode->i_rwsem:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 */
enum inode_i_mutex_lock_class {
	I_MUTEX_NORMAL	= 0,
	I_MUTEX_PARENT	= 1,
	I_MUTEX_CHILD	= 2,
	I_MUTEX_XATTR	= 3,
	I_MUTEX_NONDIR2	= 4,
	I_MUTEX_PARENT2	= 5,
};
/* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not.
*/ staticinline loff_t i_size_read(conststruct inode *inode)
{ #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
loff_t i_size; unsignedint seq;
do {
seq = read_seqcount_begin(&inode->i_size_seqcount);
i_size = inode->i_size;
} while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
loff_t i_size;
preempt_disable();
i_size = inode->i_size;
preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif
}
/*
 * Store @i_size into inode->i_size.
 *
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
	preempt_disable();
	write_seqcount_begin(&inode->i_size_seqcount);
	inode->i_size = i_size;
	write_seqcount_end(&inode->i_size_seqcount);
	preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
	preempt_disable();
	inode->i_size = i_size;
	preempt_enable();
#else
	/*
	 * Pairs with smp_load_acquire() in i_size_read() to ensure
	 * changes related to inode size (such as page contents) are
	 * visible before we see the changed inode size.
	 */
	smp_store_release(&inode->i_size, i_size);
#endif
}
/* Owner information for a file: who receives SIGIO and which signal. */
struct fown_struct {
	/* backpointer for security modules */
	struct file *file;
	/* protects pid, uid, euid fields */
	rwlock_t lock;
	/* pid or -pgrp where SIGIO should be sent */
	struct pid *pid;
	/* Kind of process group SIGIO should be sent to */
	enum pid_type pid_type;
	/* uid/euid of process setting the owner */
	kuid_t uid, euid;
	/* posix.1b rt signal to be delivered on IO */
	int signum;
};
/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @order: Preferred folio order used for most recent readahead.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
	pgoff_t start;
	unsigned int size;
	unsigned int async_size;
	unsigned int ra_pages;
	unsigned short order;
	unsigned short mmap_miss;
	loff_t prev_pos;
};
/*
 * Check if @index falls in the readahead window, i.e. inside the
 * half-open range [ra->start, ra->start + ra->size).
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
	return (index >= ra->start &&
		index <  ra->start + ra->size);
}
/** * struct file - Represents a file * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_ref: reference count
*/ struct file {
spinlock_t f_lock;
fmode_t f_mode; conststruct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsignedint f_flags; unsignedint f_iocb_flags; conststruct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */
u64 f_pipe;
};
loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */
errseq_t f_wb_err;
errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra;
freeptr_t f_freeptr;
};
file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE	((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE 	((loff_t)LLONG_MAX)
#endif
/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;
/* Forward declarations; only pointers to these types are needed here. */
struct file_lock;
struct file_lease;
/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
#define OFFSET_MAX		type_max(loff_t)
#define OFFT_OFFSET_MAX	type_max(off_t)
#endif
/* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs.
*/ staticinlinestruct dentry *file_dentry(conststruct file *file)
{ struct dentry *dentry = file->f_path.dentry;
#define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
#define MNT_DETACH	0x00000002	/* Just detach from the tree */
#define MNT_EXPIRE	0x00000004	/* Mark for expiry */
#define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
#define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
/* sb->s_iflags */
#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */

#define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
#define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
#define SB_I_NOIDMAP	0x00002000	/* No idmapped mounts on this superblock */
#define SB_I_ALLOW_HSM	0x00004000	/* Allow HSM events on this superblock */
/* Possible states of 'frozen' field */
enum {
	/* FS is unfrozen */
	SB_UNFROZEN		= 0,
	/* Writes, dir ops, ioctls frozen */
	SB_FREEZE_WRITE		= 1,
	/* Page faults stopped as well */
	SB_FREEZE_PAGEFAULT	= 2,
	/* For internal FS use (e.g. to stop internal threads if needed) */
	SB_FREEZE_FS		= 3,
	/* ->freeze_fs finished successfully */
	SB_FREEZE_COMPLETE	= 4,
};

/* One less than SB_FREEZE_COMPLETE. */
#define SB_FREEZE_LEVELS	(SB_FREEZE_COMPLETE - 1)
/* Freeze bookkeeping for a superblock: state, counters and per-level locks. */
struct sb_writers {
	unsigned short			frozen;		/* Is sb frozen? */
	int				freeze_kcount;	/* How many kernel freeze requests? */
	int				freeze_ucount;	/* How many userspace freeze requests? */
	const void			*freeze_owner;	/* Owner of the freeze */
	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
};
struct super_block { struct list_head s_list; /* Keep this first */
dev_t s_dev; /* search index; _not_ kdev_t */ unsignedchar s_blocksize_bits; unsignedlong s_blocksize;
loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; conststruct super_operations *s_op; conststruct dquot_operations *dq_op; conststruct quotactl_ops *s_qcop; conststruct export_operations *s_export_op; unsignedlong s_flags; unsignedlong s_iflags; /* internal SB_I_* flags */ unsignedlong s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count;
atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif conststruct xattr_handler * const *s_xattr; #ifdef CONFIG_FS_ENCRYPTION conststruct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY conststruct fsverity_operations *s_vop; #endif #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding;
__u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsignedint s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */
struct sb_writers s_writers;
/* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified.
*/ void *s_fs_info; /* Filesystem private info */
/* Granularity of c/m/atime in ns (cannot be worse than a second) */
u32 s_time_gran; /* Time limits for c/m/atime in seconds */
time64_t s_time_min;
time64_t s_time_max; #ifdef CONFIG_FSNOTIFY
u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif
/* * q: why are s_id and s_sysfs_name not the same? both are human * readable strings that identify the filesystem * a: s_id is allowed to change at runtime; it's used in log messages, * and we want to when a device starts out as single device (s_id is dev * name) but then a device is hot added and we have to switch to * identifying it by UUID * but s_sysfs_name is a handle for programmatic access, and can't * change at runtime
*/ char s_id[32]; /* Informational name */
uuid_t s_uuid; /* UUID */
u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
/* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ char s_sysfs_name[UUID_STRING_LEN + 1];
unsignedint s_max_links; unsignedint s_d_flags; /* default d_flags for dentries */
/* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned.
*/ struct mutex s_vfs_rename_mutex; /* Kludge */
/* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype"
*/ constchar *s_subtype;
conststruct dentry_operations *__s_d_op; /* default d_op for dentries */
/* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels.
*/ struct user_namespace *s_user_ns;
/* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines.
*/ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work;
/* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem.
*/ staticinline uid_t i_uid_read(conststruct inode *inode)
{ return from_kuid(i_user_ns(inode), inode->i_uid);
}
/** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
*/ staticinline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, conststruct inode *inode)
{ return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid);
}
/** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not.
*/ staticinlinebool i_uid_needs_update(struct mnt_idmap *idmap, conststruct iattr *attr, conststruct inode *inode)
{ return ((attr->ia_valid & ATTR_UID) &&
!vfsuid_eq(attr->ia_vfsuid,
i_uid_into_vfsuid(idmap, inode)));
}
/** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid.
*/ staticinlinevoid i_uid_update(struct mnt_idmap *idmap, conststruct iattr *attr, struct inode *inode)
{ if (attr->ia_valid & ATTR_UID)
inode->i_uid = from_vfsuid(idmap, i_user_ns(inode),
attr->ia_vfsuid);
}
/** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned.
*/ staticinline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, conststruct inode *inode)
{ return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid);
}
/** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not.
*/ staticinlinebool i_gid_needs_update(struct mnt_idmap *idmap, conststruct iattr *attr, conststruct inode *inode)
{ return ((attr->ia_valid & ATTR_GID) &&
!vfsgid_eq(attr->ia_vfsgid,
i_gid_into_vfsgid(idmap, inode)));
}
/** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid.
*/ staticinlinevoid i_gid_update(struct mnt_idmap *idmap, conststruct iattr *attr, struct inode *inode)
{ if (attr->ia_valid & ATTR_GID)
inode->i_gid = from_vfsgid(idmap, i_user_ns(inode),
attr->ia_vfsgid);
}
/**
 * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_uid field of @inode. If the inode was found/created via
 * an idmapped mount map the caller's fsuid according to @idmap.
 */
static inline void inode_fsuid_set(struct inode *inode,
				   struct mnt_idmap *idmap)
{
	inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode));
}
/**
 * inode_fsgid_set - initialize inode's i_gid field with callers fsgid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_gid field of @inode. If the inode was found/created via
 * an idmapped mount map the caller's fsgid according to @idmap.
 */
static inline void inode_fsgid_set(struct inode *inode,
				   struct mnt_idmap *idmap)
{
	inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode));
}
/** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not.
*/ staticinlinebool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap)
{ struct user_namespace *fs_userns = sb->s_user_ns;
kuid_t kuid;
kgid_t kgid;
kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) returnfalse;
kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) returnfalse; return kuid_has_mapping(fs_userns, kuid) &&
kgid_has_mapping(fs_userns, kgid);
}
/*
 * Multigrain timestamps
 *
 * Fine-grained ctime and mtime timestamps are used conditionally, when
 * there are users actively observing them via getattr.  The primary
 * use-case is NFS clients that rely on the ctime to tell different states
 * of a file apart, and that are often fooled by several operations landing
 * in the same coarse-grained timer tick.
 */
#define I_CTIME_QUERIED		((u32)BIT(31))
/** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec }
*/ staticinlinestruct timespec64 inode_set_ctime(struct inode *inode,
time64_t sec, long nsec)
{ struct timespec64 ts = { .tv_sec = sec,
.tv_nsec = nsec };
/** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
*/ staticinlineint __sb_write_started(conststruct super_block *sb, int level)
{ return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
}
/** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
*/ staticinlinebool sb_write_started(conststruct super_block *sb)
{ return __sb_write_started(sb, SB_FREEZE_WRITE);
}
/** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
*/ staticinlinebool sb_write_not_started(conststruct super_block *sb)
{ return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
}
/** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG.
*/ staticinlinebool file_write_started(conststruct file *file)
{ if (!S_ISREG(file_inode(file)->i_mode)) returntrue; return sb_write_started(file_inode(file)->i_sb);
}
/** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG.
*/ staticinlinebool file_write_not_started(conststruct file *file)
{ if (!S_ISREG(file_inode(file)->i_mode)) returntrue; return sb_write_not_started(file_inode(file)->i_sb);
}
/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement number of writers to the filesystem. Wake up possible waiters
 * wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_WRITE);
}
/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement number of processes handling write page fault to the filesystem.
 * Wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
}
/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement fs-internal number of writers to the filesystem.  Wake up possible
 * waiters wanting to freeze the filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
	__sb_end_write(sb, SB_FREEZE_FS);
}
/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * When a process wants to write data or metadata to a file system (i.e. dirty
 * a page or an inode), it should embed the operation in a sb_start_write() -
 * sb_end_write() pair to get exclusion against file system freezing. This
 * function increments number of writers preventing freezing. If the file
 * system is already frozen, the function waits until the file system is
 * thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_rwsem			(write path, truncate, directory ops, ...)
 *   -> s_umount		(freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_WRITE);
}
/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * When a process starts handling write page fault, it should embed the
 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page fault
 * is going to dirty a page. This function increments number of running page
 * faults preventing freezing. If the file system is already frozen, the
 * function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. It is advised to
 * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
 * handling code implies lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_PAGEFAULT);
}
/**
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example filesystem can call sb_start_intwrite() when starting a
 * transaction which somewhat eases handling of freezing for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on file
 * close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
	__sb_start_write(sb, SB_FREEZE_FS);
}
/**
 * struct renamedata - contains all information required for renaming
 * @old_mnt_idmap:     idmap of the old mount the inode was found from
 * @old_parent:        parent of source
 * @old_dentry:                source
 * @new_mnt_idmap:     idmap of the new mount the inode was found from
 * @new_parent:        parent of destination
 * @new_dentry:                destination
 * @delegated_inode:   returns an inode needing a delegation break
 * @flags:             rename flags
 */
struct renamedata {
	struct mnt_idmap *old_mnt_idmap;
	struct dentry *old_parent;
	struct dentry *old_dentry;
	struct mnt_idmap *new_mnt_idmap;
	struct dentry *new_parent;
	struct dentry *new_dentry;
	struct inode **delegated_inode;
	unsigned int flags;
} __randomize_layout;
/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 * Return 'true' to keep going and 'false' if there are no more entries.
 */
struct dir_context;
typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
			 unsigned);
struct dir_context {
filldir_t actor;
loff_t pos; /* * Filesystems MUST NOT MODIFY count, but may use as a hint: * 0 unknown * > 0 space in buffer (assume at least one entry) * INT_MAX unlimited
*/ int count;
};
/*
 * When this bit is OR-ed into d_type, pending signals are not checked.
 */
#define FILLDIR_FLAG_NOINTR	0x1000
/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:	Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:	Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:	Can be mapped for reading
 * NOMMU_MAP_WRITE:	Can be mapped for writing
 * NOMMU_MAP_EXEC:	Can be mapped for execution
 */
#define NOMMU_MAP_COPY		0x00000001
#define NOMMU_MAP_DIRECT	0x00000008
#define NOMMU_MAP_READ		VM_MAYREAD
#define NOMMU_MAP_WRITE		VM_MAYWRITE
#define NOMMU_MAP_EXEC		VM_MAYEXEC
/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP		(1 << 0)
#define REMAP_FILE_CAN_SHORTEN		(1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 */
#define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)
/*
 * Flags steering vfs_copy_file_range(). They are kernel-internal and are
 * not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 */
#define COPY_FILE_SPLICE		(1 << 0)
/**
 * enum freeze_holder - holder of the freeze
 * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
 * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
 * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
 * @FREEZE_EXCL: a freeze that can only be undone by the owner
 *
 * Identifies the owner of a freeze or thaw request and states whether the
 * freeze must be exclusive or may nest.
 *
 * Unless @FREEZE_MAY_NEST is given, the same holder may not issue multiple
 * freeze and thaw requests. Holding one @FREEZE_HOLDER_USERSPACE freeze and
 * one @FREEZE_HOLDER_KERNEL freeze at the same time is allowed, however; some
 * filesystems rely on this during online repair or similar.
 */
enum freeze_holder {
	FREEZE_HOLDER_KERNEL	= 0x1U,	/* bit 0 */
	FREEZE_HOLDER_USERSPACE	= 0x2U,	/* bit 1 */
	FREEZE_MAY_NEST		= 0x4U,	/* bit 2 */
	FREEZE_EXCL		= 0x8U,	/* bit 3 */
};
void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *, enum freeze_holder who, constvoid *owner); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *, enum freeze_holder who, constvoid *owner); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, constchar *, size_t, loff_t); struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); /* * If a filesystem can support graceful removal of a device and * continue read-write operations, implement this callback. * * Return 0 if the filesystem can continue read-write. * Non-zero return value or no such callback means the fs will be shutdown * as usual.
*/ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); void (*shutdown)(struct super_block *sb);
};
/*
 * Inode flags - they have no relation to superblock flags now
 */
#define S_SYNC		(1 << 0)  /* Writes are synced at once */
#define S_NOATIME	(1 << 1)  /* Do not update access times */
#define S_APPEND	(1 << 2)  /* Append-only file */
#define S_IMMUTABLE	(1 << 3)  /* Immutable file */
#define S_DEAD		(1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA	(1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC	(1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME	(1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE	(1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE	(1 << 9)  /* Inode is fs-internal */
#define S_IMA		(1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT	(1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC		(1 << 12) /* no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX		(1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX		0	  /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED	(1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD	(1 << 15) /* Casefolded file */
#define S_VERITY	(1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE	(1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
#define S_ANON_INODE	(1 << 19) /* Inode is an anonymous inode */
/*
 * Flags such as nosuid are per-inode: setting them on the file system only
 * means that every inode inherits them by default. Overriding them
 * selectively might be possible with some ioctl(), but none is currently
 * implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, a filesystem's flags can be changed while it is mounted
 * and has files in use, in which case not all inodes get their i_flags
 * updated. Hence, i_flags no longer inherit the superblock mount flags,
 * so these have to be checked separately. -- rmk@arm.uk.linux.org
 *
 * __IS_FLG() tests @flg against the flags of the superblock @inode
 * belongs to.
 */
#define __IS_FLG(inode, flg) \
	((inode)->i_sb->s_flags & (flg))
/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC		Inode is dirty, but doesn't have to be written on
 *			fdatasync() (unless I_DIRTY_DATASYNC is also set).
 *			Timestamp updates are the usual cause.
 * I_DIRTY_DATASYNC	Data-related inode changes pending.  We keep track of
 *			these changes separately from I_DIRTY_SYNC so that we
 *			don't have to write inode on fdatasync() when only
 *			e.g. the timestamps have changed.
 * I_DIRTY_PAGES	Inode has dirty pages.  Inode itself may be clean.
 * I_DIRTY_TIME		The inode itself has dirty timestamps, and the
 *			lazytime mount option is enabled.  We keep track of this
 *			separately from I_DIRTY_SYNC in order to implement
 *			lazytime.  This gets cleared if I_DIRTY_INODE
 *			(I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
 *			I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
 *			in place because writeback might already be in progress
 *			and we don't want to lose the time update
 * I_NEW		Serves as both a mutex and completion notification.
 *			New inodes set I_NEW.  If two processes both create
 *			the same inode, one of them will release its inode and
 *			wait for I_NEW to be released before returning.
 *			Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *			also cause waiting on I_NEW, without I_NEW actually
 *			being set.  find_inode() uses this to prevent returning
 *			nearly-dead inodes.
 * I_WILL_FREE		Must be set when calling write_inode_now() if i_count
 *			is zero.  I_FREEING must be set when I_WILL_FREE is
 *			cleared.
 * I_FREEING		Set when inode is about to be freed but still has dirty
 *			pages or buffers attached or the inode itself is still
 *			dirty.
 * I_CLEAR		Added by clear_inode().  In this state the inode is
 *			clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *			Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *			prohibited for many purposes.  iget() must wait for
 *			the inode to be completely released, then create it
 *			anew.  Other functions will just ignore such inodes,
 *			if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC		Writeback of inode is running.  The bit is set during
 *			data writeback, and cleared with a wakeup on the bit
 *			address once it is done.  The bit is also used to pin
 *			the inode in memory for flusher thread.
 *
 * I_REFERENCED		Marks the inode as recently referenced on the LRU list.
 *
 * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
 *			synchronize competing switching instances and to tell
 *			wb stat updates to grab the i_pages lock.  See
 *			inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE		Used by overlayfs to get exclusive ownership on upper
 *			and work dirs among overlayfs mounts.
 *
 * I_CREATING		New object's inode in the middle of setting up.
 *
 * I_DONTCACHE		Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED	Inode is queued in b_io or b_more_io writeback lists.
 *			Used to detect that mark_inode_dirty() should not move
 *			inode between dirty lists.
 *
 * I_PINNING_FSCACHE_WB	Inode is pinning an fscache object for writeback.
 *
 * I_LRU_ISOLATING	Inode is pinned being isolated from LRU without holding
 *			i_count.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 *
 * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
 * upon. There's one free address left.
 */
#define __I_NEW			0
#define I_NEW			(1 << __I_NEW)
#define __I_SYNC		1
#define I_SYNC			(1 << __I_SYNC)
#define __I_LRU_ISOLATING	2
#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
/*
 * Returns true if the given inode itself only has dirty timestamps (its pages
 * may still be dirty) and isn't currently being allocated or freed.
 * Filesystems should call this if, when writing an inode with lazytime
 * enabled, they want to opportunistically write the timestamps of other inodes
 * located very nearby on-disk, e.g. in the same inode block.  This returns true
 * if the given inode is in need of such an opportunistic update.  Requires
 * i_lock, or at least later re-checking under i_lock.
 */
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
	/* I_DIRTY_TIME must be set while I_NEW/I_FREEING/I_WILL_FREE are clear. */
	return (inode->i_state & (I_DIRTY_TIME | I_NEW |
				  I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
}
/** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise.
*/ staticinlinebool is_mgtime(conststruct inode *inode)
{ return inode->i_opflags & IOP_MGTIME;
}
/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * The reference on the file's current f_op is dropped first, then @fops is
 * installed; a NULL @fops triggers BUG_ON().
 */
#define replace_fops(f, fops) \
	do {	\
		struct file *__file = (f); \
		fops_put(__file->f_op); \
		__file->f_op = (fops); \
		BUG_ON(!__file->f_op); \
	} while (0)
/* Set the sb sysfs name from sb->s_bdev, formatted with "%pg". */
static inline void super_set_sysfs_name_bdev(struct super_block *sb)
{
	snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev);
}
/*
 * Set the sb sysfs name from sb->s_uuid, formatted with "%pU".
 * Warns if sb->s_uuid_len does not match the size of sb->s_uuid.
 */
static inline void super_set_sysfs_name_uuid(struct super_block *sb)
{
	WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
	snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b);
}
/* Set the sb sysfs name by copying sb->s_id. */
static inline void super_set_sysfs_name_id(struct super_block *sb)
{
	strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name));
}
/* try to use something standard before you use this */
__printf(2, 3) staticinlinevoid super_set_sysfs_name_generic(struct super_block *sb, constchar *fmt, ...)
{
va_list args;
/** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not.
*/ staticinlinebool is_idmapped_mnt(conststruct vfsmount *mnt)
{ return mnt_idmap(mnt) != &nop_mnt_idmap;
}
/* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd.
*/ /* Get the path to display in /proc/<pid>/maps */ staticinlineconststruct path *file_user_path(conststruct file *f)
{ if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path;
} /* Get the inode whose inode number to display in /proc/<pid>/maps */ staticinlineconststruct inode *file_user_inode(conststruct file *f)
{ if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f);
}
/* Helper for the simple case when original dentry is used */ staticinlineint finish_open_simple(struct file *file, int error)
{ if (error) return error;
/* fs/char_dev.c */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/* Marks the top and bottom of the second segment of free char majors */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384
/* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed.
*/ staticinline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{ if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp,
iocb->ki_pos - count, iocb->ki_pos - 1,
(iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret;
} elseif (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping;
/**
 * file_start_write - get write access to a superblock for regular file io
 * @file: the file we want to write to
 *
 * This is a variant of sb_start_write() which is a noop on a non-regular file.
 * Should be matched with a call to file_end_write().
 */
static inline void file_start_write(struct file *file)
{
	if (!S_ISREG(file_inode(file)->i_mode))
		return;
	sb_start_write(file_inode(file)->i_sb);
}
/**
 * file_end_write - drop write access to a superblock of a regular file
 * @file: the file we wrote to
 *
 * Does nothing for a non-regular file.
 * Should be matched with a call to file_start_write().
 */
static inline void file_end_write(struct file *file)
{
	if (!S_ISREG(file_inode(file)->i_mode))
		return;
	sb_end_write(file_inode(file)->i_sb);
}
/**
 * kiocb_start_write - get write access to a superblock for async file io
 * @iocb: the io context we want to submit the write with
 *
 * This is a variant of sb_start_write() for async io submission.
 * Should be matched with a call to kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	sb_start_write(inode->i_sb);
	/*
	 * Fool lockdep by telling it the lock got released so that it
	 * doesn't complain about the held lock when we return to userspace.
	 */
	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
}
/**
 * kiocb_end_write - drop write access to a superblock after async file io
 * @iocb: the io context we submitted the write with
 *
 * Should be matched with a call to kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/*
	 * Tell lockdep we inherited freeze protection from submission thread.
	 */
	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
	sb_end_write(inode->i_sb);
}
/*
 * This is used for regular files where some users -- especially the
 * currently executed binary in a process, previously handled via
 * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
 * read-write shared) accesses.
 *
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * deny_write_access() denies write access to a file.
 * allow_write_access() re-enables write access to a file.
 *
 * The i_writecount field of an inode can have the following values:
 * 0: no write access, no denied write access
 * < 0: (-i_writecount) users that denied write access to the file.
 * > 0: (i_writecount) users that have write access to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */

/* Returns 0 on success, -ETXTBSY if i_writecount is negative. */
static inline int get_write_access(struct inode *inode)
{
	return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY;
}

/* Returns 0 on success, -ETXTBSY if i_writecount is positive. */
static inline int deny_write_access(struct file *file)
{
	struct inode *inode = file_inode(file);

	return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
}

/* Unconditionally decrements i_writecount. */
static inline void put_write_access(struct inode *inode)
{
	atomic_dec(&inode->i_writecount);
}

/* Increments i_writecount of the file's inode; a NULL @file is ignored. */
static inline void allow_write_access(struct file *file)
{
	if (file)
		atomic_inc(&file_inode(file)->i_writecount);
}
/*
 * Do not prevent write to executable file when watched by pre-content events.
 *
 * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at
 * the time of file open and remains constant for entire lifetime of the file,
 * so if pre-content watches are added post execution or removed before the end
 * of the execution, it will not cause i_writecount reference leak.
 */
static inline int exe_file_deny_write_access(struct file *exe_file)
{
	/* Watched files are left writable; report success without denying. */
	if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
		return 0;
	return deny_write_access(exe_file);
}

static inline void exe_file_allow_write_access(struct file *exe_file)
{
	/* Nothing to undo for a NULL file or one that was never denied. */
	if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
		return;
	allow_write_access(exe_file);
}
/**
 * is_dot_dotdot - returns true only if @name is "." or ".."
 * @name: file name to check
 * @len: length of file name, in bytes
 *
 * A zero @len yields false without reading @name.
 */
static inline bool is_dot_dotdot(const char *name, size_t len)
{
	return len && unlikely(name[0] == '.') &&
		(len == 1 || (len == 2 && name[1] == '.'));
}
/** * name_contains_dotdot - check if a file name contains ".." path components * * Search for ".." surrounded by either '/' or start/end of string.
*/ staticinlinebool name_contains_dotdot(constchar *name)
{
size_t name_len;
/*
 * Userspace may rely on the inode number being non-zero. For example, glibc
 * simply ignores files with zero i_ino in unlink() and other places.
 *
 * As an additional complication, if userspace was compiled with
 * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the
 * lower 32 bits, so we need to check that those aren't zero explicitly. With
 * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but
 * better safe than sorry.
 */
static inline bool is_zero_ino(ino_t ino)
{
	/* Only the low 32 bits are examined, see above. */
	return (u32)ino == 0;
}
/*
 * Take an additional reference on @inode by incrementing i_count.
 *
 * inode->i_lock must be held
 */
static inline void __iget(struct inode *inode)
{
	atomic_inc(&inode->i_count);
}
/*
 * This must be used for allocating filesystems specific inodes to set
 * up the inode reclaim context correctly.
 *
 * Expands to kmem_cache_alloc_lru() on @_cache with the s_inode_lru of
 * superblock @_sb and allocation flags @_gfp. The arguments are
 * parenthesized so that arbitrary expressions can be passed safely.
 */
#define alloc_inode_sb(_sb, _cache, _gfp) \
	kmem_cache_alloc_lru((_cache), &(_sb)->s_inode_lru, (_gfp))
/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Increments @inode's i_dio_count to mark a direct I/O request as in
 * flight. Every call should be matched by a call to inode_dio_end(), which
 * drops the count again and wakes up waiters once it reaches zero.
 */
static inline void inode_dio_begin(struct inode *inode)
{
	atomic_inc(&inode->i_dio_count);
}
/**
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called once we've finished processing a direct I/O request,
 * and is used to wake up callers waiting for direct I/O to be quiesced.
 * The wakeup is only issued when i_dio_count drops to zero.
 */
static inline void inode_dio_end(struct inode *inode)
{
	if (atomic_dec_and_test(&inode->i_dio_count))
		wake_up_var(&inode->i_dio_count);
}
#if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(conststruct dentry *dentry, struct qstr *str); int generic_ci_d_compare(conststruct dentry *dentry, unsignedint len, constchar *str, conststruct qstr *name);
/** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding.
*/ staticinlinebool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
{ if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) returntrue;
/* * A casefold dir must have a encoding set, unless the filesystem * is corrupted
*/ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) returntrue;
if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) returnfalse; if (!vma_is_dax(vma)) returnfalse;
inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) returnfalse; /* device-dax */ returntrue;
}
/*
 * Translate the open flags of @file into the matching IOCB_* flags:
 * O_APPEND -> IOCB_APPEND, O_DIRECT -> IOCB_DIRECT, O_DSYNC -> IOCB_DSYNC
 * and __O_SYNC -> IOCB_SYNC. Returns the resulting flag mask.
 */
static inline int iocb_flags(struct file *file)
{
	int res = 0;

	if (file->f_flags & O_APPEND)
		res |= IOCB_APPEND;
	if (file->f_flags & O_DIRECT)
		res |= IOCB_DIRECT;
	if (file->f_flags & O_DSYNC)
		res |= IOCB_DSYNC;
	if (file->f_flags & __O_SYNC)
		res |= IOCB_SYNC;
	return res;
}
/*
 * Validate the per-I/O RWF_* @flags and fold them into @ki->ki_flags.
 *
 * @ki:      the kiocb to update
 * @flags:   RWF_* flags supplied for this I/O
 * @rw_type: direction of the I/O; RWF_ATOMIC is only accepted for WRITE
 *
 * Returns 0 on success (including when @flags is zero), or:
 *   -EOPNOTSUPP  unknown flag, or the file lacks support for RWF_NOWAIT,
 *                RWF_ATOMIC or RWF_DONTCACHE (or RWF_ATOMIC on a non-write,
 *                or RWF_DONTCACHE on a DAX inode)
 *   -EINVAL      RWF_APPEND and RWF_NOAPPEND were both given
 *   -EPERM       RWF_NOAPPEND on an append-only inode opened with append
 *
 * @ki->ki_flags is left untouched on any of the -EOPNOTSUPP/-EINVAL/-EPERM
 * returns.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
				     int rw_type)
{
	int kiocb_flags = 0;

	/* make sure there's no overlap between RWF and private IOCB flags */
	BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

	if (!flags)
		return 0;
	if (unlikely(flags & ~RWF_SUPPORTED))
		return -EOPNOTSUPP;
	if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
		return -EINVAL;

	if (flags & RWF_NOWAIT) {
		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
			return -EOPNOTSUPP;
	}
	if (flags & RWF_ATOMIC) {
		if (rw_type != WRITE)
			return -EOPNOTSUPP;
		if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
			return -EOPNOTSUPP;
	}
	if (flags & RWF_DONTCACHE) {
		/* file system must support it */
		if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE))
			return -EOPNOTSUPP;
		/* DAX mappings not supported */
		if (IS_DAX(ki->ki_filp->f_mapping->host))
			return -EOPNOTSUPP;
	}
	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
	if (flags & RWF_SYNC)
		kiocb_flags |= IOCB_DSYNC;

	/* RWF_NOAPPEND overrides an append-mode open unless the inode forbids it. */
	if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
		if (IS_APPEND(file_inode(ki->ki_filp)))
			return -EPERM;
		ki->ki_flags &= ~IOCB_APPEND;
	}

	ki->ki_flags |= kiocb_flags;
	return 0;
}
/* Transaction based IO helpers */
/*
 * Argument/response buffer of a simple transaction. It lives in an
 * allocated page and records the size of the argument or response
 * together with its content.
 */
struct simple_transaction_argresp {
	/* size of the argument or response */
	ssize_t size;
	/* the content itself (flexible array member) */
	char data[];
};
/*
 * simple attribute files
 *
 * These attributes behave similar to those in sysfs:
 *
 * Writing to an attribute immediately sets a value, an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that are accessed with the get() and set() functions.
 *
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() emits an open handler named
 * <__fops>_open and a file_operations instance named <__fops>. The open
 * handler passes @__fmt through __simple_attr_check_format() with a 0ull
 * argument before calling simple_attr_open(). @__is_signed selects
 * simple_attr_write_signed over simple_attr_write as the write handler.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)	\
static int __fops ## _open(struct inode *inode, struct file *file)	\
{									\
	__simple_attr_check_format(__fmt, 0ull);			\
	return simple_attr_open(inode, file, __get, __set, __fmt);	\
}									\
static const struct file_operations __fops = {				\
	.owner	 = THIS_MODULE,						\
	.open	 = __fops ## _open,					\
	.release = simple_attr_release,					\
	.read	 = simple_attr_read,					\
	.write	 = (__is_signed) ? simple_attr_write_signed : simple_attr_write,	\
	.llseek	 = generic_file_llseek,					\
}
/*
 * Compile-time helper: the body is empty, the __printf(1, 2) annotation
 * makes the compiler check the variadic arguments against @fmt.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
	/* don't do anything, just let the compiler check the arguments; */
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.