// SPDX-License-Identifier: GPL-2.0-or-later /* * NET4: Implementation of BSD Unix domain sockets. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: * Linus Torvalds : Assorted bug cures. * Niibe Yutaka : async I/O support. * Carsten Paeth : PF_UNIX check, address fixes. * Alan Cox : Limit size of allocated blocks. * Alan Cox : Fixed the stupid socketpair bug. * Alan Cox : BSD compatibility fine tuning. * Alan Cox : Fixed a bug in connect when interrupted. * Alan Cox : Sorted out a proper draft version of * file descriptor passing hacked up from * Mike Shaver's work. * Marty Leisner : Fixes to fd passing * Nick Nevin : recvmsg bugfix. * Alan Cox : Started proper garbage collector * Heiko EiBfeldt : Missing verify_area check * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting * Kirk Petersen : Made this a module * Christoph Rohland : Elegant non-blocking accept/connect algorithm. * Lots of bug fixes. * Alexey Kuznetosv : Repaired (I hope) bugs introduces * by above two patches. * Andrea Arcangeli : If possible we block in connect(2) * if the max backlog of the listen socket * is been reached. This won't break * old apps and it will avoid huge amount * of socks hashed (this for unix_gc() * performances reasons). * Security fix that limits the max * number of socks to 2*max_files and * the number of skb queueable in the * dgram receiver. * Artur Skawina : Hash function optimizations * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) * Malcolm Beattie : Set peercred for socketpair * Michal Ostrowski : Module initialization cleanup. * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, * the core infrastructure is doing that * for all net proto families now (2.5.69+) * * Known differences from reference BSD that was tested: * * [TO FIX] * ECONNREFUSED is not returned from one end of a connected() socket to the * other the moment one end closes. 
* fstat() doesn't return st_dev=0, and give the blksize as high water mark * and a fake inode identifier (nor the BSD first socket fstat twice bug). * [NOT TO FIX] * accept() returns a path name even if the connecting socket has closed * in the meantime (BSD loses the path and gives up). * accept() returns 0 length path for an unbound connector. BSD returns 16 * and a null first byte in the path (but not for gethost/peername - BSD bug ??) * socketpair(...SOCK_RAW..) doesn't panic the kernel. * BSD af_unix apparently has connect forgetting to block properly. * (need to check this with the POSIX spec in detail) * * Differences from 2.0.0-11-... (ANK) * Bug fixes and improvements. * - client shutdown killed server socket. * - removed all useless cli/sti pairs. * * Semantic changes/extensions. * - generic control message passing. * - SCM_CREDENTIALS control message. * - "Abstract" (not FS based) socket bindings. * Abstract names are sequences of bytes (not zero terminated) * started by 0, so that this name space does not intersect * with BSD names.
*/
/* SMP locking strategy: * hash table is protected with spinlock. * each socket state is protected by separate spinlock.
*/ #ifdef CONFIG_PROVE_LOCKING #define cmp_ptr(l, r) (((l) > (r)) - ((l) < (r)))
a = container_of(_a, struct unix_sock, lock.dep_map);
b = container_of(_b, struct unix_sock, lock.dep_map);
if (a->sk.sk_state == TCP_LISTEN) { /* unix_stream_connect(): Before the 2nd unix_state_lock(), * * 1. a is TCP_LISTEN. * 2. b is not a. * 3. concurrent connect(b -> a) must fail. * * Except for 2. & 3., the b's state can be any possible * value due to concurrent connect() or listen(). * * 2. is detected in debug_spin_lock_before(), and 3. cannot * be expressed as lock_cmp_fn.
*/ switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: case TCP_LISTEN: return -1; default: /* Invalid case. */ return 0;
}
}
/* Should never happen. Just to be symmetric. */ if (b->sk.sk_state == TCP_LISTEN) { switch (b->sk.sk_state) { case TCP_CLOSE: case TCP_ESTABLISHED: return 1; default: return 0;
}
}
/* Drop one reference to a unix_address, freeing it on the final put. */
static inline void unix_release_addr(struct unix_address *addr)
{
	if (!refcount_dec_and_test(&addr->refcnt))
		return;

	kfree(addr);
}
/*
 * Check a unix socket name passed in from userspace:
 * - must not be zero length;
 * - if it starts with a non-zero byte, it is a NUL-terminated
 *   filesystem object;
 * - if it starts with a zero byte, it is an abstract name.
 */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len > sizeof(*sunaddr) ||
	    addr_len <= offsetof(struct sockaddr_un, sun_path))
		return -EINVAL;

	return sunaddr->sun_family == AF_UNIX ? 0 : -EINVAL;
}
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *storage = (struct sockaddr_storage *)sunaddr;
	short path_off = offsetof(struct sockaddr_storage, __data);

	/* Not an off-by-one: 108 is the longest valid AF_UNIX path for a
	 * binding, so sun_path[108] does not exist as such.  In kernel
	 * space, however, syscall functions always pass a pointer to a
	 * struct sockaddr_storage, whose buffer is larger than 108 bytes,
	 * so the store below stays inside valid memory.  sun_path must
	 * also be NUL-terminated for strlen() in getname_kernel().
	 */
	storage->__data[addr_len - path_off] = 0;

	/* Run strlen() on __data rather than sunaddr->sun_path so that
	 * __fortify_strlen() sees the actual buffer; a length of 108 via
	 * sun_path would panic under CONFIG_FORTIFY_SOURCE=y.
	 */
	return strlen(storage->__data) + path_off + 1;
}
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len)) return s;
} return NULL;
}
staticinlinestruct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsignedint hash)
{ struct sock *s;
spin_lock(&net->unx.table.locks[hash]);
s = __unix_find_socket_byname(net, sunname, len, hash); if (s)
sock_hold(s);
spin_unlock(&net->unx.table.locks[hash]); return s;
}
/* Support code for asymmetrically connected dgram sockets * * If a datagram socket is connected to a socket not itself connected * to the first socket (eg, /dev/log), clients may only enqueue more * messages if the present receive queue of the server socket is not * "too large". This means there's a second writeability condition * poll and sendmsg need to test. The dgram recv code will do a wake * up on the peer_wait wait queue of a socket upon reception of a * datagram which needs to be propagated to sleeping would-be writers * since these might not have sent anything so far. This can't be * accomplished via poll_wait because the lifetime of the server * socket might be less than that of its clients if these break their * association with it or if the server socket is closed while clients * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or * poll for write) hit the flow control condition and broken when the * association to the server socket is dissolved or after a wake up * was relayed.
*/
/* relaying can only happen while the wq still exists */
u_sleep = sk_sleep(&u->sk); if (u_sleep)
wake_up_interruptible_poll(u_sleep, key_to_poll(key));
/* If other is SOCK_DEAD, we want to make sure we signal * POLLOUT, such that a subsequent write() can get a * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT.
*/ if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1;
if (connected)
unix_dgram_peer_wake_disconnect(sk, other);
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets arrived from the previous peer.  First, this allows flow
 * control based only on wmem_alloc; second, a sk connected to a peer may
 * receive messages only from that peer.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
	if (skb_queue_empty(&sk->sk_receive_queue))
		return;

	skb_queue_purge_reason(&sk->sk_receive_queue,
			       SKB_DROP_REASON_UNIX_DISCONNECT);

	/* If one link of a bidirectional dgram pipe is disconnected, we
	 * signal the error; the messages are lost.  Don't do this when the
	 * peer was never connected to us.
	 */
	if (sock_flag(other, SOCK_DEAD) || unix_peer(other) != sk)
		return;

	WRITE_ONCE(other->sk_err, ECONNRESET);
	sk_error_report(other);
}
if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb && !unix_skb_len(skb))
skb = skb_peek_next(skb, &sk->sk_receive_queue); #endif
unix_state_lock(skpair); /* No more writes */
WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (skb || embrion)
WRITE_ONCE(skpair->sk_err, ECONNRESET);
unix_state_unlock(skpair);
skpair->sk_state_change(skpair);
sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
}
unix_dgram_peer_wake_disconnect(sk, skpair);
sock_put(skpair); /* It may now die */
}
/* Try to flush out this socket. Throw out buffers at least */
while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
/* passed fds are erased in the kfree_skb hook */
kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
}
if (path.dentry)
path_put(&path);
sock_put(sk);
/* ---- Socket is dead now and most probably destroyed ---- */
/* * Fixme: BSD difference: In BSD all sockets connected to us get * ECONNRESET and we die on the spot. In Linux we behave * like files and pipes do and wait for the last * dereference. * * Can't we simply set sock->err? * * What the above comment does talk about? --ANK(980817)
*/
if (READ_ONCE(unix_tot_inflight))
unix_gc(); /* Garbage collect fds */
}
if (sk) {
s_state = READ_ONCE(sk->sk_state);
u = unix_sk(sk);
/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. * SOCK_DGRAM is ordinary. So, no lock is needed.
*/ if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
nr_fds = atomic_read(&u->scm_stat.nr_fds); elseif (s_state == TCP_LISTEN)
nr_fds = unix_count_nr_fds(sk);
/* proto ->close() stub: a unix socket needs no ->close() work of its own.
 * This exists purely so that sockmap has a callback to invoke.
 */
static void unix_close(struct sock *sk, long timeout)
{
}
/* Tell the BPF getsockopt machinery which options to leave alone: only
 * SOL_SOCKET/SO_PEERPIDFD bypasses the hook here.
 * NOTE(review): this chunk ends before the function's final return and
 * closing brace — truncated in the extraction, not a code change.
 */
staticbool unix_bpf_bypass_getsockopt(int level, int optname)
{ if (level == SOL_SOCKET) { switch (optname) { case SO_PEERPIDFD: returntrue; default: returnfalse;
}
}
/* ->create() for the PF_UNIX family: validate the protocol, pick the ops
 * table matching the requested socket type, and allocate the sock.
 * NOTE(review): this chunk ends right after the unix_create1() error
 * check — the function's tail is truncated in the extraction.
 */
staticint unix_create(struct net *net, struct socket *sock, int protocol, int kern)
{ struct sock *sk;
/* 0 means "default protocol", anything else must be PF_UNIX itself. */
if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT;
sock->state = SS_UNCONNECTED;
switch (sock->type) { case SOCK_STREAM:
set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
sock->ops = &unix_stream_ops; break; /* * Believe it or not BSD has AF_UNIX, SOCK_RAW though * nothing uses it.
*/ case SOCK_RAW:
sock->type = SOCK_DGRAM;
fallthrough; case SOCK_DGRAM:
sock->ops = &unix_dgram_ops; break; case SOCK_SEQPACKET:
sock->ops = &unix_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT;
}
sk = unix_create1(net, sock, kern, sock->type); if (IS_ERR(sk)) return PTR_ERR(sk);
/* * Get the parent directory, calculate the hash for last * component.
*/
dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) {
err = PTR_ERR(dentry); goto out;
}
/* * All right, let's create it.
*/
idmap = mnt_idmap(parent.mnt);
err = security_path_mknod(&parent, dentry, mode, 0); if (!err)
err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); if (err) goto out_path;
err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; if (u->addr) goto out_unlock;
/* copy address information from listening to new sock * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found * otheru in hash under its lock. Insertion into the * hash chain we'd found it in had been done in an * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * * Using smp_store_release() here to set newu->addr * is enough to make those stores, as well as stores * to newu->path visible to anyone who gets newu->addr * by smp_load_acquire(). IOW, the same warranties * as for unix_sock instances bound in unix_bind() or * in unix_autobind().
*/ if (otheru->path.dentry) {
path_get(&otheru->path);
newu->path = otheru->path;
}
refcount_inc(&otheru->addr->refcnt);
smp_store_release(&newu->addr, otheru->addr);
smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
unix_peer(sk) = newsk;
unix_state_unlock(sk);
/* take ten and send info to listening sock */
spin_lock(&other->sk_receive_queue.lock);
__skb_queue_tail(&other->sk_receive_queue, skb);
spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other); return 0;
/* The "user->unix_inflight" variable is protected by the garbage * collection lock, and we just read it locklessly here. If you go * over the limit, there might be a tiny race in actually noticing * it across threads. Tough.
*/ staticinlinebool too_many_unix_fds(struct task_struct *p)
{ struct user_struct *user = current_user();
/** * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. * @skb: skb to attach creds to. * @sk: Sender sock. * @other: Receiver sock. * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. * * Context: May sleep. * Return: On success zero, on error a negative error code is returned.
*/ staticint unix_maybe_add_creds(struct sk_buff *skb, conststruct sock *sk, conststruct sock *other)
{ if (UNIXCB(skb).pid) return 0;
if (unix_may_passcred(sk) || unix_may_passcred(other) ||
!other->sk_socket) { struct pid *pid; int err;
pid = task_tgid(current);
err = pidfs_register_pid(pid); if (unlikely(err)) return err;
if (!unix_may_send(sk, other)) {
err = -EPERM; goto out_unlock;
}
if (unlikely(sock_flag(other, SOCK_DEAD))) { /* Check with 1003.1g - what should datagram error */
unix_state_unlock(other);
if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() * is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants.
*/
err = -EPIPE; goto out_sock_put;
}
if (!sk_locked)
unix_state_lock(sk);
if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
unix_dgram_peer_wake_disconnect_wakeup(sk, other);
if (sk->sk_type != SOCK_SEQPACKET) {
err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) goto out_unlock;
}
/* other == sk && unix_peer(other) != sk if * - unix_peer(sk) == NULL, destination address bound to sk * - unix_peer(sk) == sk by time of get but disconnected before lock
*/ if (other != sk &&
unlikely(unix_peer(other) != sk &&
unix_recvq_full_lockless(other))) { if (timeo) {
timeo = unix_wait_for_peer(other, timeo);
err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_sock_put;
goto restart;
}
if (!sk_locked) {
unix_state_unlock(other);
unix_state_double_lock(sk, other);
}
if (unix_peer(sk) != other ||
unix_dgram_peer_wake_me(sk, other)) {
err = -EAGAIN;
sk_locked = 1; goto out_unlock;
}
if (!sk_locked) {
sk_locked = 1; goto restart_locked;
}
}
if (unlikely(sk_locked))
unix_state_unlock(sk);
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
/* We use paged skbs for stream sockets, and limit occupancy to 32768 * bytes, and a minimum of a full page.
*/ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free;
if (sock_flag(sk, SOCK_RCVTSTAMP))
__sock_recv_timestamp(msg, sk, skb);
memset(&scm, 0, sizeof(scm));
unix_skb_to_scm(skb, &scm);
if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
sk_peek_offset_bwd(sk, skb->len);
} else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) - return fds, and do not return them on read (old strategy, apparently wrong) - clone fds (I chose it for now, it is the most universal solution)
POSIX 1003.1g does not actually define this clearly at all. POSIX 1003.1g doesn't define a lot of things clearly however!
	The information on this web page was compiled carefully and to the
	best of our knowledge.  However, neither completeness, correctness,
	nor quality of the information provided is guaranteed.
	Note:
	The colored syntax rendering and the measurement are still experimental.