// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 */
/** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in the user * namespace @user_ns.
*/ bool sk_ns_capable(conststruct sock *sk, struct user_namespace *user_ns, int cap)
{ return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
/** * sk_capable - Socket global capability test * @sk: Socket to use a capability on or through * @cap: The global capability to use * * Test to see if the opener of the socket had when the socket was * created and the current process has the capability @cap in all user * namespaces.
*/ bool sk_capable(conststruct sock *sk, int cap)
{ return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
/** * sk_net_capable - Network namespace socket capability test * @sk: Socket to use a capability on or through * @cap: The capability to use * * Test to see if the opener of the socket had when the socket was created * and the current process has the capability @cap over the network namespace * the socket is a member of.
*/ bool sk_net_capable(conststruct sock *sk, int cap)
{ return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
/* * Each address family might have different locking rules, so we have * one slock key per address family and separate keys for internal and * userspace sockets.
*/ staticstruct lock_class_key af_family_keys[AF_MAX]; staticstruct lock_class_key af_family_kern_keys[AF_MAX]; staticstruct lock_class_key af_family_slock_keys[AF_MAX]; staticstruct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list with one entry per
 * name below, followed by a final "AF_MAX" entry; @x is placed directly
 * in front of every string literal (presumably a string-literal prefix
 * that the compiler concatenates with the name -- confirm at the users).
 *
 * NOTE(review): the entries "27" and "28" look like placeholders for
 * family numbers without a symbolic name, and the list order presumably
 * has to follow the AF_* numbering -- confirm before adding entries.
 */
#define _sock_locks(x) \
x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
x "27" , x "28" , x "AF_CAN" , \
x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
x "AF_MCTP" , \
x "AF_MAX"
/* * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key:
*/ staticstruct lock_class_key af_callback_keys[AF_MAX]; staticstruct lock_class_key af_rlock_keys[AF_MAX]; staticstruct lock_class_key af_wlock_keys[AF_MAX]; staticstruct lock_class_key af_elock_keys[AF_MAX]; staticstruct lock_class_key af_kern_callback_keys[AF_MAX];
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Flag @sk with %SOCK_MEMALLOC so that it gets access to the emergency
 * reserves. It's the responsibility of the admin to adjust
 * min_free_kbytes to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);

	/* Allocations made on behalf of this socket carry __GFP_MEMALLOC. */
	sk->sk_allocation = sk->sk_allocation | __GFP_MEMALLOC;

	/* Account one more user on the memalloc_socks_key static branch. */
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
/* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward * progress of swapping. SOCK_MEMALLOC may be cleared while * it has rmem allocations due to the last swapfile being deactivated * but there is a risk that the socket is unusable due to exceeding * the rmem limits. Reclaim the reserves and obey rmem limits again.
*/
sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{ int ret; unsignedint noreclaim_flag;
/* these should have been dropped before queueing */
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
/*
 * Bind @sk to the network interface whose index is @ifindex.
 *
 * No socket locking is done here; presumably the caller already holds
 * the socket lock when that is required (see the callers in this file).
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  CONFIG_NETDEVICES is not enabled
 *   -EPERM        @sk is already bound to a device and the current task
 *                 lacks CAP_NET_RAW in the user namespace of the socket's
 *                 network namespace
 *   -EINVAL       @ifindex is negative
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	/* Paired with all READ_ONCE() done locklessly. */
	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

	/* Give the protocol a chance to rehash the socket, if it has a hook. */
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);

	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}
/*
 * Bind @sk to interface index @ifindex via sock_bindtoindex_locked().
 * When @lock_sk is true the call is wrapped in lock_sock()/release_sock();
 * otherwise it is made without taking the socket lock here.
 *
 * Returns whatever sock_bindtoindex_locked() returns.
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int err;

	if (!lock_sk)
		return sock_bindtoindex_locked(sk, ifindex);

	lock_sock(sk);
	err = sock_bindtoindex_locked(sk, ifindex);
	release_sock(sk);

	return err;
}
EXPORT_SYMBOL(sock_bindtoindex);
/*
 * Bind @sk to the device named in @optval.
 *
 * At most IFNAMSIZ - 1 bytes of the name are copied into a zeroed buffer,
 * so the local copy is always NUL-terminated. An empty name (or a zero
 * @optlen) selects index 0, i.e. the socket is not bound.
 *
 * Returns the result of sock_bindtoindex_locked(), or:
 *   -ENOPROTOOPT  CONFIG_NETDEVICES is not enabled
 *   -EINVAL       @optlen is negative
 *   -EFAULT       the name could not be copied from @optval
 *   -ENODEV       no device with that name exists in the socket's netns
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is read from dev, inside the RCU section. */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	sockopt_lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, index);
	sockopt_release_sock(sk);
out:
#endif

	return ret;
}
/*
 * Report the name of the device @sk is bound to.
 *
 * If the socket is not bound (sk_bound_dev_if is 0), nothing is written to
 * @optval and a length of 0 is stored through @optlen. Otherwise the caller
 * buffer must be at least IFNAMSIZ bytes (@len); the NUL-terminated name is
 * copied to @optval and its length, including the terminator, is stored
 * through @optlen.
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  CONFIG_NETDEVICES is not enabled
 *   -EINVAL       the socket is bound and @len is smaller than IFNAMSIZ
 *   -EFAULT       copying to @optval or @optlen failed
 *   any non-zero value returned by netdev_get_name()
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, bound_dev_if);
	if (ret)
		goto out;

	/* Include the terminating NUL in the reported length. */
	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_sockptr(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
bool sk_mc_loop(conststruct sock *sk)
{ if (dev_recursion_level()) returnfalse; if (!sk) returntrue; /* IPV6_ADDRFORM can change sk->sk_family under us. */ switch (READ_ONCE(sk->sk_family)) { case AF_INET: return inet_test_bit(MC_LOOP, sk); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return inet6_test_bit(MC6_LOOP, sk); #endif
}
WARN_ON_ONCE(1); returntrue;
}
EXPORT_SYMBOL(sk_mc_loop);
/*
 * Apply @valbool for one of the SO_TIMESTAMP{,NS}_{OLD,NEW} options.
 *
 * The option name is translated into the two trailing boolean arguments of
 * __sock_set_timestamps(): the first is true for the *_NEW options, the
 * second is true for the SO_TIMESTAMPNS_* options. Any other @optname is
 * ignored.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	bool is_new = false;
	bool is_ns = false;

	switch (optname) {
	case SO_TIMESTAMP_OLD:
		break;
	case SO_TIMESTAMP_NEW:
		is_new = true;
		break;
	case SO_TIMESTAMPNS_OLD:
		is_ns = true;
		break;
	case SO_TIMESTAMPNS_NEW:
		is_new = true;
		is_ns = true;
		break;
	default:
		return;
	}

	__sock_set_timestamps(sk, valbool, is_new, is_ns);
}
/*
 * Record @phc_index in sk->sk_bind_phc after checking that it is one of the
 * indices reported by ethtool_get_phc_vclocks() for the device @sk is bound
 * to.
 *
 * Returns 0 on success, or:
 *   -EOPNOTSUPP  @sk is not bound to a device, or the device lookup failed
 *   -EINVAL      @phc_index is not among the indices reported for the device
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
	struct net *net = sock_net(sk);
	struct net_device *dev = NULL;
	bool match = false;
	int *vclock_index;
	int i, num;

	if (sk->sk_bound_dev_if)
		dev = dev_get_by_index(net, sk->sk_bound_dev_if);

	if (!dev) {
		pr_err("%s: sock not bind to device\n", __func__);
		return -EOPNOTSUPP;
	}

	num = ethtool_get_phc_vclocks(dev, &vclock_index);
	/* The device is only needed for the query above. */
	dev_put(dev);

	for (i = 0; i < num; i++) {
		if (vclock_index[i] == phc_index) {
			match = true;
			break;
		}
	}

	/* NOTE(review): presumably the array is only allocated when
	 * num > 0 -- confirm against ethtool_get_phc_vclocks().
	 */
	if (num > 0)
		kfree(vclock_index);

	if (!match)
		return -EINVAL;

	WRITE_ONCE(sk->sk_bind_phc, phc_index);

	return 0;
}
int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping)
{ int val = timestamping.flags; int ret;
if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL;
if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
!(val & SOF_TIMESTAMPING_OPT_ID)) return -EINVAL;
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); else
atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
} else {
atomic_set(&sk->sk_tskey, 0);
}
}
if (val & SOF_TIMESTAMPING_OPT_STATS &&
!(val & SOF_TIMESTAMPING_OPT_TSONLY)) return -EINVAL;
if (val & SOF_TIMESTAMPING_BIND_PHC) {
ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); if (ret) return ret;
}
/*
 * Set the receive buffer size of @sk from the user-supplied @val.
 *
 * @val is capped at INT_MAX / 2, SOCK_RCVBUF_LOCK is set in sk_userlocks,
 * and sk_rcvbuf is written as twice the capped value, but never less than
 * SOCK_MIN_RCVBUF.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead. Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}
staticvoid sock_release_reserved_memory(struct sock *sk, int bytes)
{ /* Round down bytes to multiple of pages */
bytes = round_down(bytes, PAGE_SIZE);
staticint sock_reserve_memory(struct sock *sk, int bytes)
{ long allocated; bool charged; int pages;
if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) return -EOPNOTSUPP;
if (!bytes) return 0;
pages = sk_mem_pages(bytes);
/* pre-charge to memcg */
charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM;
/* pre-charge to forward_alloc */
sk_memory_allocated_add(sk, pages);
allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error.
*/ if (allocated > sk_prot_mem_limits(sk, 1)) {
sk_memory_allocated_sub(sk, pages);
mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM;
}
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
 * in 1 syscall. The limit exists to limit the amount of memory the kernel
 * allocates to copy these tokens, and to prevent looping over the frags for
 * too long.
 *
 * Each macro must sit on its own line: a second "#define" on the same line
 * would become part of the first macro's replacement text.
 */
#define MAX_DONTNEED_TOKENS 128
#define MAX_DONTNEED_FRAGS 1024
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsignedint optlen)
{ unsignedint num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; int ret = 0, num_frags = 0;
netmem_ref netmems[16];
if (copy_from_sockptr(tokens, optval, optlen)) {
kvfree(tokens); return -EFAULT;
}
xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { if (++num_frags > MAX_DONTNEED_FRAGS) goto frag_limit_reached;
if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) continue;
netmems[netmem_num++] = netmem; if (netmem_num == ARRAY_SIZE(netmems)) {
xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++)
WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
netmem_num = 0;
xa_lock_bh(&sk->sk_user_frags);
}
ret++;
}
}
frag_limit_reached:
xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++)
WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
kvfree(tokens); return ret;
} #endif
void sockopt_lock_sock(struct sock *sk)
{ /* When current->bpf_ctx is set, the setsockopt is called from * a bpf prog. bpf has ensured the sk lock has been * acquired before calling setsockopt().
*/ if (has_current_bpf_ctx()) return;
/*
 * Check that @value is a clock id accepted here.
 *
 * Returns 0 for CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_TAI, and
 * -EINVAL for every other value.
 */
static int sockopt_validate_clockid(__kernel_clockid_t value)
{
	switch (value) {
	case CLOCK_REALTIME:
	case CLOCK_MONOTONIC:
	case CLOCK_TAI:
		return 0;
	}
	return -EINVAL;
}
/* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic.
*/
int sk_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsignedint optlen)
{ struct so_timestamping timestamping; struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; int val; int valbool; struct linger ling; int ret = 0;
/* * Options without arguments
*/
if (optname == SO_BINDTODEVICE) return sock_setbindtodevice(sk, optval, optlen);
if (optlen < sizeof(int)) return -EINVAL;
if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT;
valbool = val ? 1 : 0;
/* handle options which do not require locking the socket. */ switch (optname) { case SO_PRIORITY: if (sk_set_prio_allowed(sk, val)) {
sock_set_priority(sk, val); return 0;
} return -EPERM; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: case SO_ERROR: return -ENOPROTOOPT; #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: if (val < 0) return -EINVAL;
WRITE_ONCE(sk->sk_ll_usec, val); return 0; case SO_PREFER_BUSY_POLL: if (valbool && !sockopt_capable(CAP_NET_ADMIN)) return -EPERM;
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); return 0; case SO_BUSY_POLL_BUDGET: if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
!sockopt_capable(CAP_NET_ADMIN)) return -EPERM; if (val < 0 || val > U16_MAX) return -EINVAL;
WRITE_ONCE(sk->sk_busy_poll_budget, val); return 0; #endif case SO_MAX_PACING_RATE:
{ unsignedlong ulval = (val == ~0U) ? ~0UL : (unsignedint)val; unsignedlong pacing_rate;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
copy_from_sockptr(&ulval, optval, sizeof(ulval))) { return -EFAULT;
} if (ulval != ~0UL)
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED); /* Pairs with READ_ONCE() from sk_getsockopt() */
WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
pacing_rate = READ_ONCE(sk->sk_pacing_rate); if (ulval < pacing_rate)
WRITE_ONCE(sk->sk_pacing_rate, ulval); return 0;
} case SO_TXREHASH: if (!sk_is_tcp(sk)) return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT)
val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); /* Paired with READ_ONCE() in tcp_rtx_synack() * and sk_getsockopt().
*/
WRITE_ONCE(sk->sk_txrehash, (u8)val); return 0; case SO_PEEK_OFF:
{ int (*set_peek_off)(struct sock *sk, int val);
set_peek_off = READ_ONCE(sock->ops)->set_peek_off; if (set_peek_off)
ret = set_peek_off(sk, val); else
ret = -EOPNOTSUPP; return ret;
} #ifdef CONFIG_PAGE_POOL case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW: return sock_set_timeout(&sk->sk_sndtimeo, optval,
optlen, optname == SO_SNDTIMEO_OLD); case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW: return sock_set_timeout(&sk->sk_rcvtimeo, optval,
optlen, optname == SO_RCVTIMEO_OLD);
}
sockopt_lock_sock(sk);
switch (optname) { case SO_DEBUG: if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES; else
sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR:
sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: if (valbool && !sk_is_inet(sk))
ret = -EOPNOTSUPP; else
sk->sk_reuseport = valbool; break; case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
sk_dst_reset(sk); break; case SO_BROADCAST:
sock_valbool_flag(sk, SOCK_BROADCAST, valbool); break; case SO_SNDBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints
*/
val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value.
*/
val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
WRITE_ONCE(sk->sk_sndbuf,
max_t(int, val * 2, SOCK_MIN_SNDBUF)); /* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk); break;
case SO_SNDBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM; break;
}
/* No negative values (to prevent underflow, as val will be * multiplied by 2).
*/ if (val < 0)
val = 0; goto set_sndbuf;
case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints
*/
__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break;
case SO_RCVBUFFORCE: if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM; break;
}
/* No negative values (to prevent underflow, as val will be * multiplied by 2).
*/
__sock_set_rcvbuf(sk, max(val, 0)); break;
case SO_KEEPALIVE: if (sk->sk_prot->keepalive)
sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break;
case SO_OOBINLINE:
sock_valbool_flag(sk, SOCK_URGINLINE, valbool); break;
case SO_NO_CHECK:
sk->sk_no_check_tx = valbool; break;
case SO_LINGER: if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */ break;
} if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
ret = -EFAULT; break;
} if (!ling.l_onoff) {
sock_reset_flag(sk, SOCK_LINGER);
} else { unsignedlong t_sec = ling.l_linger;
case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: case SO_TIMESTAMPNS_OLD: case SO_TIMESTAMPNS_NEW:
sock_set_timestamp(sk, optname, valbool); break;
case SO_TIMESTAMPING_NEW: case SO_TIMESTAMPING_OLD: if (optlen == sizeof(timestamping)) { if (copy_from_sockptr(×tamping, optval, sizeof(timestamping))) {
ret = -EFAULT; break;
}
} else {
memset(×tamping, 0, sizeof(timestamping));
timestamping.flags = val;
}
ret = sock_set_timestamping(sk, optname, timestamping); break;
case SO_RCVLOWAT:
{ int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
if (val < 0)
val = INT_MAX; if (sock)
set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; if (set_rcvlowat)
ret = set_rcvlowat(sk, val); else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); break;
} case SO_ATTACH_FILTER: { struct sock_fprog fprog;
ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret)
ret = sk_attach_filter(&fprog, sk); break;
} case SO_ATTACH_BPF:
ret = -EINVAL; if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break;
ret = sk_attach_bpf(ufd, sk);
} break;
case SO_ATTACH_REUSEPORT_CBPF: { struct sock_fprog fprog;
ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); if (!ret)
ret = sk_reuseport_attach_filter(&fprog, sk); break;
} case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL; if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT; if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) break;
ret = sk_reuseport_attach_bpf(ufd, sk);
} break;
case SO_DETACH_REUSEPORT_BPF:
ret = reuseport_detach_prog(sk); break;
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk); break;
case SO_LOCK_FILTER: if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
ret = -EPERM; else
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); break;
case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM; break;
}
__sock_set_mark(sk, val); break; case SO_RCVMARK:
sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break;
case SO_RCVPRIORITY:
sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); break;
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break;
case SO_WIFI_STATUS:
sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); break;
case SO_NOFCS:
sock_valbool_flag(sk, SOCK_NOFCS, valbool); break;
case SO_SELECT_ERR_QUEUE:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break;
case SO_PASSCRED: if (sk_may_scm_recv(sk))
sk->sk_scm_credentials = valbool; else
ret = -EOPNOTSUPP; break;
case SO_PASSSEC: if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
sk->sk_scm_security = valbool; else
ret = -EOPNOTSUPP; break;
case SO_PASSPIDFD: if (sk_is_unix(sk))
sk->sk_scm_pidfd = valbool; else
ret = -EOPNOTSUPP; break;
case SO_PASSRIGHTS: if (sk_is_unix(sk))
sk->sk_scm_rights = valbool; else
ret = -EOPNOTSUPP; break;
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val); break;
case SO_CNX_ADVICE: if (val == 1)
dst_negative_advice(sk); break;
case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (!(sk_is_tcp(sk) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
ret = -EOPNOTSUPP;
} elseif (sk->sk_family != PF_RDS) {
ret = -EOPNOTSUPP;
} if (!ret) { if (val < 0 || val > 1)
ret = -EINVAL; else
sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
} break;
case SO_TXTIME: if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL; break;
} elseif (copy_from_sockptr(&sk_txtime, optval, sizeof(struct sock_txtime))) {
ret = -EFAULT; break;
} elseif (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL; break;
} /* CLOCK_MONOTONIC is only used by sch_fq, and this packet * scheduler has enough safe guards.
*/ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM; break;
}
ret = sockopt_validate_clockid(sk_txtime.clockid); if (ret) break;
case SO_TIMESTAMPNS_OLD:
v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); break;
case SO_TIMESTAMP_NEW:
v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); break;
case SO_TIMESTAMPNS_NEW:
v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); break;
case SO_TIMESTAMPING_OLD: case SO_TIMESTAMPING_NEW:
lv = sizeof(v.timestamping); /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only * returning the flags when they were set through the same option. * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
*/ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
} break;
case SO_RCVTIMEO_OLD: case SO_RCVTIMEO_NEW:
lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
SO_RCVTIMEO_OLD == optname); break;
case SO_SNDTIMEO_OLD: case SO_SNDTIMEO_NEW:
lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
SO_SNDTIMEO_OLD == optname); break;
case SO_RCVLOWAT:
v.val = READ_ONCE(sk->sk_rcvlowat); break;
case SO_SNDLOWAT:
v.val = 1; break;
case SO_PASSCRED: if (!sk_may_scm_recv(sk)) return -EOPNOTSUPP;
v.val = sk->sk_scm_credentials; break;
case SO_PASSPIDFD: if (!sk_is_unix(sk)) return -EOPNOTSUPP;
v.val = sk->sk_scm_pidfd; break;
case SO_PASSRIGHTS: if (!sk_is_unix(sk)) return -EOPNOTSUPP;
v.val = sk->sk_scm_rights; break;
case SO_PEERCRED:
{ struct ucred peercred; if (len > sizeof(peercred))
len = sizeof(peercred);
/* The use of PIDFD_STALE requires stashing of struct pid * on pidfs with pidfs_register_pid() and only AF_UNIX * were prepared for this.
*/ if (sk->sk_family == AF_UNIX)
flags = PIDFD_STALE;
case SO_PEERGROUPS:
{ conststruct cred *cred; int ret, n;
cred = sk_get_peer_cred(sk); if (!cred) return -ENODATA;
n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
put_cred(cred); return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
ret = groups_to_user(optval, cred->group_info);
put_cred(cred); if (ret) return ret; goto lenout;
}
case SO_PEERNAME:
{ struct sockaddr_storage address;
lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; if (copy_to_sockptr(optval, &address, len)) return -EFAULT; goto lenout;
}
/* Dubious BSD thing... Probably nobody even uses it, but * the UNIX standard wants it for whatever reason... -DaveM
*/ case SO_ACCEPTCONN:
v.val = sk->sk_state == TCP_LISTEN; break;
case SO_PASSSEC: if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) return -EOPNOTSUPP;
v.val = sk->sk_scm_security; break;
case SO_PEERSEC: return security_socket_getpeersec_stream(sock,
optval, optlen, len);
case SO_MARK:
v.val = READ_ONCE(sk->sk_mark); break;
case SO_RCVMARK:
v.val = sock_flag(sk, SOCK_RCVMARK); break;
case SO_RCVPRIORITY:
v.val = sock_flag(sk, SOCK_RCVPRIORITY); break;
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL); break;
case SO_WIFI_STATUS:
v.val = sock_flag(sk, SOCK_WIFI_STATUS); break;
case SO_PEEK_OFF: if (!READ_ONCE(sock->ops)->set_peek_off) return -EOPNOTSUPP;
case SO_BINDTOIFINDEX:
v.val = READ_ONCE(sk->sk_bound_dev_if); break;
case SO_NETNS_COOKIE:
lv = sizeof(u64); if (len != lv) return -EINVAL;
v.val64 = sock_net(sk)->net_cookie; break;
case SO_BUF_LOCK:
v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; break;
case SO_RESERVE_MEM:
v.val = READ_ONCE(sk->sk_reserved_mem); break;
case SO_TXREHASH: if (!sk_is_tcp(sk)) return -EOPNOTSUPP;
/* Paired with WRITE_ONCE() in sk_setsockopt() */
v.val = READ_ONCE(sk->sk_txrehash); break;
default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7).
*/ return -ENOPROTOOPT;
}
if (len > lv)
len = lv; if (copy_to_sockptr(optval, &v, len)) return -EFAULT;
lenout: if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0;
}
/* * Initialize an sk_lock. * * (We also register the sk_lock with the lock validator.)
*/ staticinlinevoid sock_lock_init(struct sock *sk)
{
sk_owner_clear(sk);
/* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, * even temporarily, because of RCU lookups. sk_node should also be left as is. * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/ staticvoid sock_copy(struct sock *nsk, conststruct sock *osk)
{ conststruct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif
/* If we move sk_tx_queue_mapping out of the private section, * we must check if sk_tx_queue_clear() is called after * sock_copy() in sk_clone_lock().
*/
BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
offsetof(struct sock, sk_dontcopy_begin) ||
offsetof(struct sock, sk_tx_queue_mapping) >=
offsetof(struct sock, sk_dontcopy_end));
if (slab != NULL)
kmem_cache_free(slab, sk); else
kfree(sk);
module_put(owner);
}
/** * sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket?
*/ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
{ struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); if (sk) {
sk->sk_family = family; /* * See comment in struct sock definition to understand * why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) {
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
} else {
net_passive_inc(net);
__netns_tracker_alloc(net, &sk->ns_tracker, false, priority);
}
/* Sockets having SOCK_RCU_FREE will call this function after one RCU * grace period. This is the case for UDP sockets and TCP listeners.
*/ staticvoid __sk_destruct(struct rcu_head *head)
{ struct sock *sk = container_of(head, struct sock, sk_rcu); struct net *net = sock_net(sk); struct sk_filter *filter;
if (rcu_access_pointer(sk->sk_reuseport_cb)) {
reuseport_detach_sock(sk);
use_call_rcu = true;
}
if (use_call_rcu)
call_rcu(&sk->sk_rcu, __sk_destruct); else
__sk_destruct(&sk->sk_rcu);
}
/*
 * Final stage of freeing @sk once sk_wmem_alloc has dropped to zero.
 *
 * For sockets with sk_net_refcnt set, the in-use count of the socket's
 * network namespace is decremented first. If such a socket also has
 * sock_diag destroy listeners, the destroy event is broadcast instead of
 * calling sk_destruct() directly (presumably the broadcast path takes care
 * of destructing the socket afterwards -- confirm in sock_diag); in every
 * other case sk_destruct() is called here.
 */
static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}
void sk_free(struct sock *sk)
{
	/*
	 * Drop our own unit of sk_wmem_alloc.  A non-zero remainder means
	 * packets are still sitting in some tx queue; in that case
	 * sock_wfree() will call __sk_free(sk) later, so there is nothing
	 * left to do here.
	 */
	if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
		return;

	__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
/** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
*/ struct sock *sk_clone_lock(conststruct sock *sk, const gfp_t priority)
{ struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; bool is_charged = true; struct sock *newsk;
newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (!newsk) goto out;
sock_copy(newsk, sk);
newsk->sk_prot_creator = prot;
/* SANITY */ if (likely(newsk->sk_net_refcnt)) {
get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
sock_inuse_add(sock_net(newsk), 1);
} else { /* Kernel sockets are not elevating the struct net refcount. * Instead, use a tracker to more easily detect if a layer * is not properly dismantling its kernel sockets at netns * destroy time.
*/
net_passive_inc(sock_net(newsk));
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority);
}
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
newsk->sk_backlog.len = 0;
atomic_set(&newsk->sk_rmem_alloc, 0);
/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
refcount_set(&newsk->sk_wmem_alloc, 1);
/* sk->sk_memcg will be populated at accept() time */
newsk->sk_memcg = NULL;
cgroup_sk_clone(&newsk->sk_cgrp_data);
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning
*/
is_charged = sk_filter_charge(newsk, filter);
RCU_INIT_POINTER(newsk->sk_filter, filter);
rcu_read_unlock();
if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new * socket if we couldn't charge it in the first place * as otherwise we uncharge the parent's filter.
*/ if (!is_charged)
RCU_INIT_POINTER(newsk->sk_filter, NULL);
goto free;
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
if (bpf_sk_storage_clone(sk, newsk)) goto free;
/* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning.
*/ if (sk_user_data_is_nocopy(newsk))
newsk->sk_user_data = NULL;
/* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details)
*/
smp_wmb();
refcount_set(&newsk->sk_refcnt, 2);
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
out: return newsk;
free: /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free()
*/
newsk->sk_destruct = NULL;
bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL; goto out;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
rcu_read_lock();
dev = dst_dev_rcu(dst);
sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk);
sk->sk_route_caps |= NETIF_F_GSO;
icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
} if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled))
sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
sk_dst_set(sk, dst);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
/* * Simple resource managers for sockets.
*/
/* * Write buffer destructor automatically called from kfree_skb.
*/ void sock_wfree(struct sk_buff *skb)
{ struct sock *sk = skb->sk; unsignedint len = skb->truesize; bool free;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) &&
sk->sk_write_space == sock_def_write_space) {
rcu_read_lock();
free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
sock_def_write_space_wfree(sk);
rcu_read_unlock(); if (unlikely(free))
__sk_free(sk); return;
}
/* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call
*/
WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
len = 1;
} /*
--> --------------------
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.