// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/proc/base.c * * Copyright (C) 1991, 1992 Linus Torvalds * * proc base directory handling functions * * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. * Instead of using magical inumbers to determine the kind of object * we allocate and fill in-core inodes upon lookup. They don't even * go into icache. We cache the reference to task_struct upon lookup too. * Eventually it should become a filesystem in its own. We don't use the * rest of procfs anymore. * * * Changelog: * 17-Jan-2005 * Allan Bezerra * Bruna Moreira <bruna.moreira@indt.org.br> * Edjard Mota <edjard.mota@indt.org.br> * Ilias Biris <ilias.biris@indt.org.br> * Mauricio Lin <mauricio.lin@indt.org.br> * * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * * A new process specific entry (smaps) included in /proc. It shows the * size of rss for each memory area. The maps entry lacks information * about physical memory size (rss) for each mapped file, i.e., * rss information for executables and library files. * This additional information is useful for any tools that need to know * about physical memory consumption for a process specific library. * * Changelog: * 21-Feb-2005 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT * Pud inclusion in the page table walking. * * ChangeLog: * 10-Mar-2005 * 10LE Instituto Nokia de Tecnologia - INdT: * A better way to walks through the page table as suggested by Hugh Dickins. * * Simo Piiroinen <simo.piiroinen@nokia.com>: * Smaps information related to shared, private, clean and dirty pages. * * Paul Mundt <paul.mundt@nokia.com>: * Overall revision about smaps.
*/
/* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during * each system call not at open time. The reason is that most of * what we wish to check for permissions in /proc varies at runtime. * * The classic example of a problem is opening file descriptors * in /proc for a task before it execs a suid executable.
*/
staticint __init early_proc_mem_force_override(char *buf)
{ if (!buf) return -EINVAL;
/* * lookup_constant() defaults to proc_mem_force_override to preserve * the initial Kconfig choice in case an invalid param gets passed.
*/
proc_mem_force_override = lookup_constant(proc_mem_force_table,
buf, proc_mem_force_override);
/*
 * Compute the hardlink count for a pid_entry table: start from 2 for the
 * "." and ".." links, then add one for every directory-type entry.
 */
static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
					   unsigned int n)
{
	unsigned int nlink = 2;	/* "." and ".." */
	unsigned int idx;

	for (idx = 0; idx < n; idx++)
		if (S_ISDIR(entries[idx].mode))
			nlink++;

	return nlink;
}
staticint get_task_root(struct task_struct *task, struct path *root)
{ int result = -ENOENT;
task_lock(task); if (task->fs) {
get_fs_root(task->fs, root);
result = 0;
}
task_unlock(task); return result;
}
staticint proc_cwd_link(struct dentry *dentry, struct path *path)
{ struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT;
if (task) {
task_lock(task); if (task->fs) {
get_fs_pwd(task->fs, path);
result = 0;
}
task_unlock(task);
put_task_struct(task);
} return result;
}
/*
 * Resolve /proc/<pid>/root: fill @path with the task's root directory
 * via get_task_root().  Returns 0 on success, -ENOENT when the task is
 * already gone or has no fs_struct.
 */
static int proc_root_link(struct dentry *dentry, struct path *path)
{
	struct task_struct *task;
	int err;

	task = get_proc_task(d_inode(dentry));
	if (!task)
		return -ENOENT;

	err = get_task_root(task, path);
	put_task_struct(task);

	return err;
}
/*
 * If the user used setproctitle(), we just get the string from
 * user space at arg_start, and limit it to a maximum of one page.
 */
static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
				size_t count, unsigned long pos,
				unsigned long arg_start)
{
	char *page;
	int ret, got;

	/* Output is clamped to one page, so offsets past it read as EOF. */
	if (pos >= PAGE_SIZE)
		return 0;

	page = (char *)__get_free_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	ret = 0;
	/* Read up to a page of the remote task's argv area. */
	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
	if (got > 0) {
		int len = strnlen(page, got);

		/* Include the NUL character if it was found */
		if (len < got)
			len++;

		if (len > pos) {
			len -= pos;
			if (len > count)
				len = count;
			/*
			 * copy_to_user() returns the number of bytes NOT
			 * copied; subtracting gives the bytes delivered.
			 * Zero delivered means the user buffer faulted.
			 */
			len -= copy_to_user(buf, page + pos, len);
			if (!len)
				len = -EFAULT;
			ret = len;
		}
	}
	free_page((unsigned long)page);
	return ret;
}
/* * We allow setproctitle() to overwrite the argument * strings, and overflow past the original end. But * only when it overflows into the environment area.
*/ if (env_start != arg_end || env_end < env_start)
env_start = env_end = arg_end;
len = env_end - arg_start;
/* We're not going to care if "*ppos" has high bits set */
pos = *ppos; if (pos >= len) return 0; if (count > len - pos)
count = len - pos; if (!count) return 0;
/* * Magical special case: if the argv[] end byte is not * zero, the user has overwritten it with setproctitle(3). * * Possible future enhancement: do this only once when * pos is 0, and set a flag in the 'struct file'.
*/ if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c) return get_mm_proctitle(mm, buf, count, pos, arg_start);
/* * For the non-setproctitle() case we limit things strictly * to the [arg_start, arg_end[ range.
*/
pos += arg_start; if (pos < arg_start || pos >= arg_end) return 0; if (count > arg_end - pos)
count = arg_end - pos;
page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM;
len = 0; while (count) { int got;
size_t size = min_t(size_t, PAGE_SIZE, count);
got = access_remote_vm(mm, pos, page, size, FOLL_ANON); if (got <= 0) break;
got -= copy_to_user(buf, page, got); if (unlikely(!got)) { if (!len)
len = -EFAULT; break;
}
pos += got;
buf += got;
len += got;
count -= got;
}
/* * The ability to racily run the kernel stack unwinder on a running task * and then observe the unwinder output is scary; while it is useful for * debugging kernel issues, it can also allow an attacker to leak kernel * stack contents. * Doing this in a manner that is at least safe from races would require * some work to ensure that the remote task can not be scheduled; and * even then, this would still expose the unwinder as local attack * surface. * Therefore, this interface is restricted to root.
*/ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) return -EACCES;
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_KERNEL); if (!entries) return -ENOMEM;
err = lock_trace(task); if (!err) { unsignedint i, nr_entries;
badness = oom_badness(task, totalpages); /* * Special case OOM_SCORE_ADJ_MIN for all others scale the * badness value into [0, 2000] range which we have been * exporting for a long time so userspace might depend on it.
*/ if (badness != LONG_MIN)
points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
/************************************************************************/ /* Here the fs part begins */ /************************************************************************/
/* permission checks */

/*
 * May the caller see a task's file descriptors?  Access is allowed when
 * the target is the caller itself or when ptrace attach to the process
 * would succeed; PTRACE_MODE_READ_FSCREDS marks this as a filesystem
 * syscall (checked with fs credentials).
 */
static bool proc_fd_access_allowed(struct inode *inode)
{
	struct task_struct *task = get_proc_task(inode);
	bool ok;

	if (!task)
		return false;

	ok = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
	put_task_struct(task);

	return ok;
}
int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr)
{ int error; struct inode *inode = d_inode(dentry);
if (attr->ia_valid & ATTR_MODE) return -EPERM;
error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error;
/* * May current process learn task's sched/cmdline info (for hide_pid_min=1) * or euid/egid (for hide_pid_min=2)?
*/ staticbool has_pid_permissions(struct proc_fs_info *fs_info, struct task_struct *task, enum proc_hidepid hide_pid_min)
{ /* * If 'hidpid' mount option is set force a ptrace check, * we indicate that we are using a filesystem syscall * by passing PTRACE_MODE_READ_FSCREDS
*/ if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
if (fs_info->hide_pid < hide_pid_min) returntrue; if (in_group_p(fs_info->pid_gid)) returntrue; return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
if (!has_perms) { if (fs_info->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process * may not stat() a file, it shouldn't be seen * in procfs at all.
*/ return -ENOENT;
}
task = get_proc_task(file_inode(file)); if (!task) return -ESRCH;
mutex_lock(&oom_adj_mutex); if (legacy) { if (oom_adj < task->signal->oom_score_adj &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES; goto err_unlock;
} /* * /proc/pid/oom_adj is provided for legacy purposes, ask users to use * /proc/pid/oom_score_adj instead.
*/
pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
current->comm, task_pid_nr(current), task_pid_nr(task),
task_pid_nr(task));
} else { if ((short)oom_adj < task->signal->oom_score_adj_min &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES; goto err_unlock;
}
}
/* * Make sure we will check other processes sharing the mm if this is * not vfork which wants its own oom_score_adj. * pin the mm so it doesn't go away and get reused after task_unlock
*/ if (!task->vfork_done) { struct task_struct *p = find_lock_task_mm(task);
if (p) { if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
mm = p->mm;
mmgrab(mm);
}
task_unlock(p);
}
}
/* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. * Values written to oom_adj are simply mapped linearly to oom_score_adj. * Processes that become oom disabled via oom_adj will still be oom disabled * with this implementation. * * oom_adj cannot be removed since existing userspace binaries use it.
*/ static ssize_t oom_adj_write(struct file *file, constchar __user *buf,
size_t count, loff_t *ppos)
{ char buffer[PROC_NUMBUF] = {}; int oom_adj; int err;
/* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable.
*/ if (oom_adj == OOM_ADJUST_MAX)
oom_adj = OOM_SCORE_ADJ_MAX; else
oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
/* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL;
/* Slurp in the user data */
kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf);
/* Parse the user data */
ret = -EINVAL;
noffsets = 0; for (pos = kbuf; pos; pos = next_line) { struct proc_timens_offset *off = &offsets[noffsets]; char clock[10]; int err;
/* Find the end of line and ensure we don't look past it */
next_line = strchr(pos, '\n'); if (next_line) {
*next_line = '\0';
next_line++; if (*next_line == '\0')
next_line = NULL;
}
noffsets++; if (noffsets == ARRAY_SIZE(offsets)) { if (next_line)
count = next_line - kbuf; break;
}
}
ret = -ESRCH;
p = get_proc_task(inode); if (!p) goto out;
ret = proc_timens_set_offset(file, p, offsets, noffsets);
put_task_struct(p); if (ret) goto out;
void task_dump_owner(struct task_struct *task, umode_t mode,
kuid_t *ruid, kgid_t *rgid)
{ /* Depending on the state of dumpable compute who should own a * proc file for a task.
*/ conststruct cred *cred;
kuid_t uid;
kgid_t gid;
/* Default to the tasks effective ownership */
rcu_read_lock();
cred = __task_cred(task);
uid = cred->euid;
gid = cred->egid;
rcu_read_unlock();
/* * Before the /proc/pid/status file was created the only way to read * the effective uid of a /process was to stat /proc/pid. Reading * /proc/pid/status is slow enough that procps and other packages * kept stating /proc/pid. To keep the rules in /proc simple I have * made this apply to all per process world readable and executable * directories.
*/ if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) { struct mm_struct *mm;
task_lock(task);
mm = task->mm; /* Make non-dumpable tasks owned by some root */ if (mm) { if (get_dumpable(mm) != SUID_DUMP_USER) { struct user_namespace *user_ns = mm->user_ns;
uid = make_kuid(user_ns, 0); if (!uid_valid(uid))
uid = GLOBAL_ROOT_UID;
/* * Generating an inode and adding it into @pid->inodes, so that task will * invalidate inode's dentry before being released. * * This helper is used for creating dir-type entries under '/proc' and * '/proc/<tgid>/task'. Other entries(eg. fd, stat) under '/proc/<tgid>' * can be released by invalidating '/proc/<tgid>' dentry. * In theory, dentries under '/proc/<tgid>/task' can also be released by * invalidating '/proc/<tgid>' dentry, we reserve it to handle single * thread exiting situation: Any one of threads should invalidate its * '/proc/<tgid>/task/<pid>' dentry before released.
*/ staticstruct inode *proc_pid_make_base_inode(struct super_block *sb, struct task_struct *task, umode_t mode)
{ struct inode *inode; struct proc_inode *ei; struct pid *pid;
inode = proc_pid_make_inode(sb, task, mode); if (!inode) return NULL;
/* Let proc_flush_pid find this directory inode */
ei = PROC_I(inode);
pid = ei->pid;
spin_lock(&pid->lock);
hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
spin_unlock(&pid->lock);
/* * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. *
*/ staticint pid_revalidate(struct inode *dir, conststruct qstr *name, struct dentry *dentry, unsignedint flags)
{ struct inode *inode; struct task_struct *task; int ret = 0;
/*
 * Decide whether a /proc pid dentry should be kept.  If the task the
 * dentry represents is dead, don't put the dentry on the LRU list —
 * have it killed immediately instead.
 */
int pid_delete_dentry(const struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	return proc_inode_is_dead(inode);
}
/*
 * Fill a directory entry.
 *
 * If possible create the dcache entry and derive our inode number and
 * file type from dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached. This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
	const char *name, unsigned int len,
	instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
	struct dentry *child, *dir = file->f_path.dentry;
	struct qstr qname = QSTR_INIT(name, len);
	struct inode *inode;
	unsigned type = DT_UNKNOWN;
	/* Fallback inode number emitted when instantiation fails. */
	ino_t ino = 1;

	/* Fast path: the dentry may already be in the dcache. */
	child = try_lookup_noperm(&qname, dir);
	if (!child) {
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
		/*
		 * Allocate under the parallel-lookup protocol so a
		 * concurrent lookup of the same name waits on @wq
		 * instead of racing with us.
		 */
		child = d_alloc_parallel(dir, &qname, &wq);
		if (IS_ERR(child))
			goto end_instantiate;
		if (d_in_lookup(child)) {
			struct dentry *res;
			res = instantiate(child, task, ptr);
			d_lookup_done(child);
			/* instantiate() may hand back a different dentry. */
			if (unlikely(res)) {
				dput(child);
				child = res;
				if (IS_ERR(child))
					goto end_instantiate;
			}
		}
	}
	inode = d_inode(child);
	ino = inode->i_ino;
	/* i_mode >> 12 yields the DT_* dirent type for this mode. */
	type = inode->i_mode >> 12;
	dput(child);
end_instantiate:
	return dir_emit(ctx, name, len, ino, type);
}
/* * dname_to_vma_addr - maps a dentry name into two unsigned longs * which represent vma start and end addresses.
*/ staticint dname_to_vma_addr(struct dentry *dentry, unsignedlong *start, unsignedlong *end)
{ constchar *str = dentry->d_name.name; unsignedlonglong sval, eval; unsignedint len;
if (str[0] == '0' && str[1] != '-') return -EINVAL;
len = _parse_integer(str, 16, &sval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (sval != (unsignedlong)sval) return -EINVAL;
str += len;
if (*str != '-') return -EINVAL;
str++;
if (str[0] == '0' && str[1]) return -EINVAL;
len = _parse_integer(str, 16, &eval); if (len & KSTRTOX_OVERFLOW) return -EINVAL; if (eval != (unsignedlong)eval) return -EINVAL;
str += len;
/* * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due * to concerns about how the symlinks may be used to bypass permissions on * ancestor directories in the path to the file in question.
*/ staticconstchar *
proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
{ if (!checkpoint_restore_ns_capable(&init_user_ns)) return ERR_PTR(-EPERM);
ret = -ENOENT;
task = get_proc_task(file_inode(file)); if (!task) goto out;
ret = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task;
ret = 0; if (!dir_emit_dots(file, ctx)) goto out_put_task;
mm = get_task_mm(task); if (!mm) goto out_put_task;
ret = mmap_read_lock_killable(mm); if (ret) {
mmput(mm); goto out_put_task;
}
nr_files = 0;
/* * We need two passes here: * * 1) Collect vmas of mapped files with mmap_lock taken * 2) Release mmap_lock and instantiate entries * * otherwise we get lockdep complained, since filldir() * routine might require mmap_lock taken in might_fault().
*/
pos = 2;
vma_iter_init(&vmi, mm, 0);
for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) continue;
p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); if (!p) {
ret = -ENOMEM;
mmap_read_unlock(mm);
mmput(mm); goto out_put_task;
}
inode = proc_pid_make_inode(dentry->d_sb, task, p->mode); if (!inode) return ERR_PTR(-ENOENT);
ei = PROC_I(inode); if (S_ISDIR(inode->i_mode))
set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop)
inode->i_op = p->iop; if (p->fop)
inode->i_fop = p->fop;
ei->op = p->op;
pid_update_inode(task, inode); return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
/* * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons.
*/ for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) {
res = proc_pident_instantiate(dentry, task, p); break;
}
}
put_task_struct(task);
out_no_task: return res;
}
/* A task may only write when it was the opener. */ if (file->private_data != current->mm) return -EPERM;
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID); if (!task) {
rcu_read_unlock(); return -ESRCH;
} /* A task may only write its own attributes. */ if (current != task) {
rcu_read_unlock(); return -EACCES;
} /* Prevent changes to overridden credentials. */
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.28 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.