// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:        Implementation of BSD Unix domain sockets.
 *
 * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *              Linus Torvalds   :  Assorted bug cures.
 *              Niibe Yutaka     :  async I/O support.
 *              Carsten Paeth    :  PF_UNIX check, address fixes.
 *              Alan Cox         :  Limit size of allocated blocks.
 *              Alan Cox         :  Fixed the stupid socketpair bug.
 *              Alan Cox         :  BSD compatibility fine tuning.
 *              Alan Cox         :  Fixed a bug in connect when interrupted.
 *              Alan Cox         :  Sorted out a proper draft version of
 *                                  file descriptor passing hacked up from
 *                                  Mike Shaver's work.
 *              Marty Leisner    :  Fixes to fd passing
 *              Nick Nevin       :  recvmsg bugfix.
 *              Alan Cox         :  Started proper garbage collector
 *              Heiko EiBfeldt   :  Missing verify_area check
 *              Alan Cox         :  Started POSIXisms
 *              Andreas Schwab   :  Replace inode by dentry for proper
 *                                  reference counting
 *              Kirk Petersen    :  Made this a module
 *              Christoph Rohland:  Elegant non-blocking accept/connect algorithm.
 *                                  Lots of bug fixes.
 *              Alexey Kuznetsov :  Repaired (I hope) bugs introduced
 *                                  by above two patches.
 *              Andrea Arcangeli :  If possible we block in connect(2)
 *                                  if the max backlog of the listen socket
 *                                  has been reached. This won't break
 *                                  old apps and it will avoid huge amount
 *                                  of socks hashed (this for unix_gc()
 *                                  performance reasons).
 *                                  Security fix that limits the max
 *                                  number of socks to 2*max_files and
 *                                  the number of skb queueable in the
 *                                  dgram receiver.
 *              Artur Skawina    :  Hash function optimizations
 *              Alexey Kuznetsov :  Full scale SMP. Lot of bugs are introduced 8)
 *              Malcolm Beattie  :  Set peercred for socketpair
 *              Michal Ostrowski :  Module initialization cleanup.
 *              Arnaldo C. Melo  :  Remove MOD_{INC,DEC}_USE_COUNT,
 *                                  the core infrastructure is doing that
 *                                  for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *      [TO FIX]
 *      ECONNREFUSED is not returned from one end of a connected() socket to the
 *              other the moment one end closes.
 *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *      [NOT TO FIX]
 *      accept() returns a path name even if the connecting socket has closed
 *              in the meantime (BSD loses the path and gives up).
 *      accept() returns 0 length path for an unbound connector. BSD returns 16
 *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *      BSD af_unix apparently has connect forgetting to block properly.
 *              (need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *      Bug fixes and improvements.
 *              - client shutdown killed server socket.
 *              - removed all useless cli/sti pairs.
 *
 *      Semantic changes/extensions.
 *              - generic control message passing.
 *              - SCM_CREDENTIALS control message.
 *              - "Abstract" (not FS based) socket bindings.
 *                Abstract names are sequences of bytes (not zero terminated)
 *                started by 0, so that this name space does not intersect
 *                with BSD names.
 */
/* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock.
*/ #ifdef CONFIG_PROVE_LOCKING #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r)))
a = container_of(_a, struct unix_sock, lock.dep_map);
b = container_of(_b, struct unix_sock, lock.dep_map);
if (a->sk.sk_state == TCP_LISTEN) { /* unix_stream_connect(): Before the 2nd unix_state_lock(), * * 1. a is TCP_LISTEN. * 2. b is not a. * 3. concurrent connect(b -> a) must fail. * * Except for 2. & 3., the b's state can be any possible * value due to concurrent connect() or listen(). * * 2. is detected in debug_spin_lock_before(), and 3. cannot * be expressed as lock_cmp_fn.
*/ switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: case TCP_LISTEN: return -1; default: /* Invalid case. */ return 0;
}
}
/* Should never happen. Just to be symmetric. */ if (b->sk.sk_state == TCP_LISTEN) { switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: return 1; default: return 0;
}
}
/*
 * Drop one reference on @addr.
 *
 * The address is kfree()d only when refcount_dec_and_test() reports that
 * this call released the final reference; otherwise it is left untouched.
 */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}
/*
 *	Check unix socket name:
 *		- should be not zero length.
 *		- if started by not zero, should be NUL terminated (FS object)
 *		- if started by zero, it is abstract name.
 *
 *	This helper only performs the length and family checks:
 *	@addr_len must cover at least one byte of sun_path and must not
 *	exceed sizeof(struct sockaddr_un), and sun_family must be AF_UNIX.
 *
 *	Return: 0 if the address is acceptable, -EINVAL otherwise.
 */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	/* Reject an empty path as well as a length larger than the struct. */
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
/*
 * NUL-terminate the path of a filesystem (BSD-style) socket name and
 * compute its length.
 *
 * Writes a 0 byte at index (@addr_len - offset) of the address data, then
 * returns strlen() of that data plus the offset of the data within
 * struct sockaddr_storage plus one for the terminator.
 */
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen() in
	 * getname_kernel().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len)) return s;
} return NULL;
}
staticinlinestruct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsignedint hash)
{ struct sock *s;
spin_lock(&net->unx.table.locks[hash]);
s = __unix_find_socket_byname(net, sunname, len, hash); if (s)
sock_hold(s);
spin_unlock(&net->unx.table.locks[hash]); return s;
}
/* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed.
*/
/* relaying can only happen while the wq still exists */
u_sleep = sk_sleep(&u->sk); if (u_sleep)
wake_up_interruptible_poll(u_sleep, key_to_poll(key));
	/* If other is SOCK_DEAD, we want to make sure we signal
	 * POLLOUT, such that a subsequent write() can get a
	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
	 * to other and it's full, we will hang waiting for POLLOUT.
*/ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1;
if (connected)
unix_dgram_peer_wake_disconnect(sk, other);
/* When dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from previous peer. First, it allows to do
 * flow control based only on wmem_alloc; second, sk connected to peer
 * may receive messages only from that peer.
 *
 * @sk:    socket whose receive queue is purged.
 * @other: the previous peer; it gets ECONNRESET reported only if the
 *         queue was non-empty, @other is not SOCK_DEAD and @other's peer
 *         is @sk.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (!skb_queue_empty(&sk->sk_receive_queue)) {
		skb_queue_purge_reason(&sk->sk_receive_queue,
				       SKB_DROP_REASON_UNIX_DISCONNECT);

		/* If one link of bidirectional dgram pipe is disconnected,
		 * we signal error. Messages are lost. Do not make this,
		 * when peer was not connected to us.
		 */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
			WRITE_ONCE(other->sk_err, ECONNRESET);
			sk_error_report(other);
		}
	}
}
if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb && !unix_skb_len(skb))
skb = skb_peek_next(skb, &sk->sk_receive_queue); #endif
unix_state_lock(skpair); /* No more writes */
WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (skb || embrion)
WRITE_ONCE(skpair->sk_err, ECONNRESET);
unix_state_unlock(skpair);
skpair->sk_state_change(skpair);
sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
}
unix_dgram_peer_wake_disconnect(sk, skpair);
sock_put(skpair); /* It may now die */
}
/* Try to flush out this socket. Throw out buffers at least */
while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
/* passed fds are erased in the kfree_skb hook */
kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
}
if (path.dentry)
path_put(&path);
sock_put(sk);
/* ---- Socket is dead now and most probably destroyed ---- */
/* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817)
*/
if (READ_ONCE(unix_tot_inflight))
unix_gc(); /* Garbage collect fds */
}
if (sk) {
s_state = READ_ONCE(sk->sk_state);
u = unix_sk(sk);
/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed.
*/ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
nr_fds = atomic_read(&u->scm_stat.nr_fds); elseif (s_state == TCP_LISTEN)
nr_fds = unix_count_nr_fds(sk);
/*
 * Intentionally empty ->close() handler; both @sk and @timeout are unused.
 */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
staticbool unix_bpf_bypass_getsockopt(int level, int optname)
{ if (level == SOL_SOCKET) { switch (optname) { case SO_PEERPIDFD: returntrue; default: returnfalse;
}
}
staticint unix_create(struct net *net, struct socket *sock, int protocol, int kern)
{ struct sock *sk;
if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT;
sock->state = SS_UNCONNECTED;
switch (sock->type) { case SOCK_STREAM:
set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it.
*/ case SOCK_RAW:
sock->type = SOCK_DGRAM;
fallthrough; case SOCK_DGRAM:
sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET:
sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT;
}
sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk);
/* * Get the parent directory, calculate the hash for last * component.
*/
dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) {
err = PTR_ERR(dentry); goto out;
}
/* * All right, let's create it.
*/
idmap = mnt_idmap(parent.mnt);
err = security_path_mknod(&parent, dentry, mode, 0); if (!err)
err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path;
err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock;
/* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind().
*/ if (otheru->path.dentry) {
path_get(&otheru->path);
newu->path = otheru->path;
}
refcount_inc(&otheru->addr->refcnt);
smp_store_release(&newu->addr, otheru->addr);
smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
unix_peer(sk) = newsk;
unix_state_unlock(sk);
/* take ten and send info to listening sock */
spin_lock(&other->sk_receive_queue.lock);
__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other); return 0;
/* The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough.
*/ staticinlinebool too_many_unix_fds(struct task_struct *p)
{ struct user_struct *user = current_user();
/** * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. * @skb: skb to attach creds to. * @sk: Sender sock. * @other: Receiver sock. * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. * * Context: May sleep. * Return: On success zero, on error a negative error code is returned.
*/ staticint unix_maybe_add_creds(struct sk_buff *skb, conststruct sock *sk, conststruct sock *other)
{ if (UNIXCB(skb).pid) return 0;
if (unix_may_passcred(sk) || unix_may_passcred(other) ||
!other->sk_socket) { struct pid *pid; int err;
pid = task_tgid(current);
err = pidfs_register_pid(pid); if (unlikely(err)) return err;
if (!unix_may_send(sk, other)) {
err = -EPERM; goto out_unlock;
}
if (unlikely(sock_flag(other, SOCK_DEAD))) { /* Check with 1003.1g - what should datagram error */
unix_state_unlock(other);
if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants.
*/
err = -EPIPE; goto out_sock_put;
}
if (!sk_locked)
unix_state_lock(sk);
if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
unix_dgram_peer_wake_disconnect_wakeup(sk, other);
if (sk->sk_type != SOCK_SEQPACKET) {
err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock;
}
/* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock
*/ if (other != sk &&
unlikely(unix_peer(other) != sk &&
unix_recvq_full_lockless(other))) { if (timeo) {
timeo = unix_wait_for_peer(other, timeo);
err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_sock_put;
goto restart;
}
if (!sk_locked) {
unix_state_unlock(other);
unix_state_double_lock(sk, other);
}
if (unix_peer(sk) != other ||
unix_dgram_peer_wake_me(sk, other)) {
err = -EAGAIN;
sk_locked = 1; goto out_unlock;
}
if (!sk_locked) {
sk_locked = 1; goto restart_locked;
}
}
if (unlikely(sk_locked))
unix_state_unlock(sk);
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
/* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page.
*/ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free;
if (sock_flag(sk, SOCK_RCVTSTAMP))
__sock_recv_timestamp(msg, sk, skb);
memset(&scm, 0, sizeof(scm));
unix_skb_to_scm(skb, &scm);
if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
sk_peek_offset_bwd(sk, skb->len);
} else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution)
POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however!
#ifdef CONFIG_BPF_SYSCALL conststruct proto *prot = READ_ONCE(sk->sk_prot);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.74Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.