// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics. * Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. 
* Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support * James Chapman : Add L2TP encapsulation type.
*/
/* NOTE(review): this definition is truncated by extraction -- only the
 * signature and local declarations are visible; the walk over hslot2's
 * chain presumably follows. Verify against the complete source.
 */
/* * Note: we still hold spinlock of primary hash chain, so no other writer * can insert/delete a socket with local_port == num
*/ staticint udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk)
{
kuid_t uid = sk_uid(sk); struct sock *sk2; int res = 0;
/* NOTE(review): this function is truncated/mangled by extraction -- the
 * found/fail labels and the tail of the snum!=0 branch are not visible here.
 */
/** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address
*/ int udp_lib_get_port(struct sock *sk, unsignedshort snum, unsignedint hash2_nulladdr)
{ struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE;
if (!snum) {
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsignedshort first, last; int low, high, remaining; unsignedint rand;
rand = get_random_u32();
/* NOTE(review): low/high/remaining are used below, but their
 * initialization (the local port range lookup) is not visible in this
 * extraction -- confirm against the full source.
 */
first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE
*/
rand = (rand | 1) * (udptable->mask + 1);
last = first + udptable->mask + 1; do {
hslot = udp_hashslot(udptable, net, first);
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
udptable->log);
snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage.
*/ do { if (low <= snum && snum <= high &&
!test_bit(snum >> udptable->log, bitmap) &&
!inet_is_local_reserved_port(net, snum)) goto found;
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
cond_resched();
} while (++first != last); goto fail;
} else {
hslot = udp_hashslot(udptable, net, snum);
spin_lock_bh(&hslot->lock); if (hslot->count > 10) { int exist; unsignedint slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
/* NOTE(review): the body below is mangled by extraction. The iteration
 * header and score computation for udp4_lib_lookup1() are missing, and the
 * TCP_ESTABLISHED / inet_lookup_reuseport / rescore logic that follows
 * appears to belong to a different lookup function (it references 'skb' and
 * 'need_rescore', which are not declared here). Treat as fragments.
 */
/** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none
*/ staticstruct sock *udp4_lib_lookup1(conststruct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsignedint hnum, int dif, int sdif, conststruct udp_table *udptable)
{ unsignedint slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0;
if (sk->sk_state == TCP_ESTABLISHED) {
result = sk; continue;
}
result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, udp_ehashfn); if (!result) {
result = sk; continue;
}
/* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result;
/* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue;
/* compute_score is too long of a function to be * inlined, and calling it again here yields * measureable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'.
*/
need_rescore = true; goto rescore;
}
} return result;
}
#if IS_ENABLED(CONFIG_BASE_SMALL)
/* udp4_lib_lookup4 - four-tuple (connected-socket) lookup, CONFIG_BASE_SMALL stub.
 *
 * With CONFIG_BASE_SMALL the hash4 table is not maintained, so this stub
 * always reports "no match"; callers then fall back to the secondary
 * (port+address) and primary (port-only) hash lookups.
 *
 * NOTE(review): the original text read "staticstruct", "conststruct" and
 * "unsignedint" -- whitespace lost in extraction; restored here so the
 * declaration is valid C. Interface is otherwise unchanged.
 */
static struct sock *udp4_lib_lookup4(const struct net *net,
				     __be32 saddr, __be16 sport,
				     __be32 daddr, unsigned int hnum,
				     int dif, int sdif,
				     struct udp_table *udptable)
{
	return NULL;
}
/* NOTE(review): fragment -- the enclosing function's signature and local
 * declarations (up, node, hslot4, acookie, ports, slot) are not visible in
 * this extraction. This is the nulls-list scan with restart-on-rehash.
 */
begin: /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */
udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
sk = (struct sock *)up; if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk;
}
/* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash.
*/ if (get_nulls_value(node) != slot) goto begin;
return NULL;
}
/* NOTE(review): truncated by extraction -- only the early-return guard is
 * visible; the hslot4/nhslot4 relinking presumably follows.
 */
/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ staticvoid udp_rehash4(struct udp_table *udptable, struct sock *sk,
u16 newhash4)
{ struct udp_hslot *hslot4, *nhslot4;
/* Connected udp socket can re-connect to another remote address, which * will be handled by rehash. Thus no need to redo hash4 here.
*/ if (udp_hashed4(sk)) return;
/* NOTE(review): extraction appears to have dropped the lines computing
 * hash2 and hslot2 (the secondary-hash slot) -- hslot2 is read below before
 * any visible assignment. Verify against the complete source.
 */
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM
*/ struct sock *__udp4_lib_lookup(conststruct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb)
{ unsignedshort hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsignedint hash2;
if (udp_has_hash4(hslot2)) {
result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum,
dif, sdif, udptable); if (result) /* udp4_lib_lookup4 return sk or NULL */ return result;
}
/* Lookup connected or non-wildcard socket */
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif, sdif,
hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done;
/* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
udptable == net->ipv4.udp_table) {
sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
saddr, sport, daddr, hnum, dif,
udp_ehashfn); if (sk) {
result = sk; goto done;
}
}
/* Got non-wildcard socket or error on first lookup */ if (result) goto done;
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif, sdif,
hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done;
/* Primary hash (destination port) lookup as fallback for this race: * 1. __ip4_datagram_connect() sets sk_rcv_saddr * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet * 3. rehash operation updating _secondary and four-tuple_ hashes * The primary hash doesn't need an update after 1., so, thanks to this * further step, 1. and 3. don't need to be atomic against the lookup.
*/
result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
udptable);
done: if (IS_ERR(result)) return NULL; return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
/* Must be called under rcu_read_lock().
 * On success the returned socket has had its reference count incremented;
 * the caller owns that reference.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	struct sock *found;

	found = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
				  dif, 0, net->ipv4.udp_table, NULL);
	if (!found)
		return NULL;

	/* Refuse to hand out a socket already on its way to destruction. */
	if (!refcount_inc_not_zero(&found->sk_refcnt))
		return NULL;

	return found;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif
/* NOTE(review): truncated by extraction -- only the signature and the first
 * local declaration are visible; the match conditions are missing here.
 */
staticinlinebool __udp_is_mcast_sock(struct net *net, conststruct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsignedshort hnum)
{ conststruct inet_sock *inet = inet_sk(sk);
/* Handler for tunnels with arbitrary destination ports: no socket lookup is
 * possible, so walk the registered encapsulation error handlers until one of
 * them claims the ICMP error.
 *
 * Returns 0 if some encap handler matched, -ENOENT otherwise.
 */
static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
{
	unsigned int idx;

	for (idx = 0; idx < MAX_IPTUN_ENCAP_OPS; idx++) {
		const struct ip_tunnel_encap_ops *ops;
		int (*err_cb)(struct sk_buff *skb, u32 info);

		ops = rcu_dereference(iptun_encaps[idx]);
		if (!ops)
			continue;

		err_cb = ops->err_handler;
		if (!err_cb)
			continue;

		/* A zero return means this encapsulation claimed the error. */
		if (!err_cb(skb, info))
			return 0;
	}

	return -ENOENT;
}
/* NOTE(review): truncated by extraction -- only the signature and local
 * declarations are visible; the lookup/probe logic described in the comment
 * is missing from this view.
 */
/* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise.
*/ staticstruct sock *__udp4_lib_err_encap(struct net *net, conststruct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info)
{ int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up;
/* NOTE(review): fragment -- the enclosing function's signature and local
 * declarations (sk, net, iph, uh, type, code, info, err, harderr, tunnel,
 * inet) are not visible in this extraction; this is the interior of the
 * ICMP error handler for UDP.
 */
/* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port.
*/
if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) {
sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
info); if (!sk) return 0;
} else
sk = ERR_PTR(-ENOENT);
if (IS_ERR(sk)) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk);
}
tunnel = true;
}
err = 0;
harderr = 0;
inet = inet_sk(sk);
switch (type) { default: case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB:
err = EPROTO;
harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1; break;
} goto out;
}
err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) {
harderr = icmp_err_convert[code].fatal;
err = icmp_err_convert[code].errno;
} break; case ICMP_REDIRECT:
ipv4_sk_redirect(skb, sk); goto out;
}
/* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3.
*/ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv)
udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
(u8 *)(uh+1)); goto out;
} if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out;
} else
ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
/* NOTE(review): truncated by extraction -- only the signature and the first
 * local declaration are visible; the flush logic is missing from this view.
 */
/* * Throw away all pending data and cancel the corking. Socket is locked.
*/ void udp_flush_pending_frames(struct sock *sk)
{ struct udp_sock *up = udp_sk(sk);
/* NOTE(review): the tail of this function (completing the software checksum
 * for the frag-list case) is truncated by extraction.
 */
/** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address
*/ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
{ struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len;
__wsum csum = 0;
if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket.
*/
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~csum_tcpudp_magic(src, dst, len,
IPPROTO_UDP, 0);
} else { struct sk_buff *frags;
/* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together
*/
skb_walk_frags(skb, frags) {
csum = csum_add(csum, frags->csum);
hlen -= frags->len;
}
/* NOTE(review): truncated by extraction -- only the signature and the first
 * local declaration are visible.
 */
/* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel.
*/ void udp_set_csum(bool nocheck, struct sk_buff *skb,
__be32 saddr, __be32 daddr, int len)
{ struct udphdr *uh = udp_hdr(skb);
/* NOTE(review): fragment of udp_sendmsg() -- the signature, local
 * declarations (fl4, up, usin, msg, daddr, dport, saddr, ipc, connected,
 * rt, err, free, is_udplite, ulen, uc_index) and large parts of the body
 * (route resolution, ip_make_skb/ip_append_data) are missing from this
 * extraction. Also note 'elseif' below is 'else if' with the space lost.
 */
fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked.
*/
lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) {
release_sock(sk); return -EINVAL;
} goto do_append_data;
}
release_sock(sk);
}
ulen += sizeof(struct udphdr);
/* * Get and verify the address.
*/ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT;
}
daddr = usin->sin_addr.s_addr;
dport = usin->sin_port; if (dport == 0) return -EINVAL;
} else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ;
daddr = inet->inet_daddr;
dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set.
*/
connected = 1;
}
uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = READ_ONCE(inet->mc_index); if (!saddr)
saddr = READ_ONCE(inet->mc_addr);
connected = 0;
} elseif (!ipc.oif) {
ipc.oif = uc_index;
} elseif (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index.
*/ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
uc_index)) {
ipc.oif = uc_index;
}
}
if (connected)
rt = dst_rtable(sk_dst_check(sk, 0));
if (!rt) { struct net *net = sock_net(sk);
__u8 flow_flags = inet_sk_flowi_flags(sk);
lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */
release_sock(sk);
out:
ip_rt_put(rt);
out_free: if (free)
kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill.
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
} return err;
do_confirm: if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm;
err = 0; goto out;
}
EXPORT_SYMBOL(udp_sendmsg);
/* NOTE(review): fragment -- the signature of udp_splice_eof() and the
 * declaration of 'up' are missing from this extraction; only the tail of
 * the function is visible (push pending frames unless still corked).
 */
if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return;
lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk))
udp_push_pending_frames(sk);
release_sock(sk);
}
EXPORT_IPV6_MOD_GPL(udp_splice_eof);
#define UDP_SKB_IS_STATELESS 0x80000000
/* NOTE(review): the tail of this function (the secpath-present path and its
 * final return) is truncated by extraction. 'returntrue' below is
 * 'return true' with the space lost.
 */
/* all head states (dst, sk, nf conntrack) except skb extensions are * cleared by udp_rcv(). * * We need to preserve secpath, if present, to eventually process * IP_CMSG_PASSSEC at recvmsg() time. * * Other extensions can be cleared.
*/ staticbool udp_try_make_stateless(struct sk_buff *skb)
{ if (!skb_has_extensions(skb)) returntrue;
if (!secpath_exists(skb)) {
skb_ext_reset(skb); returntrue;
}
static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
{
	/* Reached after udp_lib_checksum_complete() returned 0, i.e.
	 * __skb_checksum_complete() may have set skb->csum_valid to 1.
	 * On 64-bit platforms we can additionally record that the checksum
	 * is known-good, but only for an unshared skb.
	 */
#if BITS_PER_LONG == 64
	if (skb_shared(skb))
		return;
	udp_skb_scratch(skb)->csum_unnecessary = true;
#endif
}
/* NOTE(review): fragment -- the enclosing function's signature and the
 * declarations of sk_queue, rx_queue_lock_held, amt, size and up are not
 * visible in this extraction.
 */
/* acquire the sk_receive_queue for fwd allocated memory scheduling,
 * if the caller doesn't hold it already
 */
sk_queue = &sk->sk_receive_queue; if (!rx_queue_lock_held)
spin_lock(&sk_queue->lock);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
atomic_sub(size, &sk->sk_rmem_alloc);
/* this can save us from acquiring the rx queue lock on next receive */
skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
if (!rx_queue_lock_held)
spin_unlock(&sk_queue->lock);
}
/* Note: called with reader_queue.lock held.
 * The skb's truesize was stashed in skb->dev_scratch when the packet was
 * enqueued (see __udp_enqueue_schedule_skb()); reading the copy instead of
 * skb->truesize avoids a cache line miss while the receive_queue lock is
 * held.
 */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
	int released;

	prefetch(&skb->data);
	released = udp_skb_truesize(skb);
	udp_rmem_release(sk, released, 1, false);
}
EXPORT_IPV6_MOD(udp_skb_destructor);
/* Same as udp_skb_destructor(), for callers that already hold the
 * rx queue lock (passes rx_queue_lock_held = true).
 */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
	int released;

	prefetch(&skb->data);
	released = udp_skb_truesize(skb);
	udp_rmem_release(sk, released, 1, true);
}
/* NOTE(review): the file-scope busylock declarations below are followed by
 * fragments of __udp_enqueue_schedule_skb() -- its signature and the
 * declarations of rmem, size, rcvbuf and busy are missing from this
 * extraction.
 */
/* Idea of busylocks is to let producers grab an extra spinlock * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. * These busylock can be allocated on a per cpu manner, instead of a * per socket one (that would consume a cache line per socket)
*/ staticint udp_busylocks_log __read_mostly; static spinlock_t *udp_busylocks __read_mostly;
/* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX.
*/ if (rmem + size > rcvbuf) { if (rcvbuf > INT_MAX >> 1) goto drop;
/* Always allow at least one packet for small buffer. */ if (rmem > rcvbuf) goto drop;
}
/* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing)
*/ if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
size = skb->truesize;
busy = busylock_acquire(sk);
}
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{ if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
sk_peek_offset_bwd(sk, len);
if (!skb_unref(skb)) return;
/* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb().
*/ if (unlikely(udp_skb_has_head_state(skb)))
skb_release_head_state(skb);
__consume_stateless_skb(skb);
}
EXPORT_IPV6_MOD_GPL(skb_consume_udp);
/* NOTE(review): after the first_packet_length() signature this extraction
 * jumps into fragments of the receive path (the __skb_try_recv_from_queue
 * loop references queue/flags/off/err/last, and the tail references
 * msg/try_again) -- those belong to other functions. Treat as fragments.
 */
/** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found.
*/ staticint first_packet_length(struct sock *sk)
{ struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; unsignedint total = 0; struct sk_buff *skb; int res;
error = -EAGAIN; do {
spin_lock_bh(&queue->lock);
skb = __skb_try_recv_from_queue(queue, flags, off, err,
&last); if (skb) { if (!(flags & MSG_PEEK))
udp_skb_destructor(sk, skb);
spin_unlock_bh(&queue->lock); return skb;
}
if (skb_queue_empty_lockless(sk_queue)) {
spin_unlock_bh(&queue->lock); goto busy_check;
}
/* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed.
*/
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
/* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy.
*/
/* starting over for a new packet, but check if we need to yield */
cond_resched();
msg->msg_flags &= ~MSG_TRUNC; goto try_again;
}
/* NOTE(review): truncated by extraction -- only the addr_len guard is
 * visible; the BPF cgroup hook invocation presumably follows.
 */
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{ /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len.
*/ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL;
/* NOTE(review): fragment -- presumably the interior of a rehash routine
 * (hslot/hslot2/nhslot2 declarations and the enclosing signature are not
 * visible in this extraction); moves the socket between secondary hash
 * chains and detaches it from its reuseport group first.
 */
if (hslot2 != nhslot2 ||
rcu_access_pointer(sk->sk_reuseport_cb)) { /* we must lock primary chain too */
spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
if (hslot2 != nhslot2) {
spin_lock(&hslot2->lock);
hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
/* Now process hash4 if necessary: * (1) update hslot4; * (2) update hslot2->hash4_cnt. * Note that hslot2/hslot4 should be checked separately, as * either of them may change with the other unchanged.
*/ if (udp_hashed4(sk)) {
spin_lock_bh(&hslot->lock);
/* NOTE(review): the tail of this function (the enqueue, drop and csum_error
 * paths) is truncated by extraction.
 */
/* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed.
*/ staticint udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk);
/* * Charge it to the socket, dropping if the queue is full.
*/ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop;
}
nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) &&
READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N
*/
/* if we're overly short, let UDP handle it */
encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret;
/* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error;
ret = encap_rcv(sk, skb); if (ret <= 0) {
__UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS,
is_udplite); return -ret;
}
}
/* FALLTHROUGH -- it's a UDP Packet */
}
/* * UDP-Lite specific tests, ignored on UDP sockets
*/ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
u16 pcrlen = READ_ONCE(up->pcrlen);
/* * MIB statistics other than incrementing the error count are * disabled for the following two types of errors: these depend * on the application settings, not on the functioning of the * protocol stack as such. * * RFC 3828 here recommends (sec 3.3): "There should also be a * way ... to ... at least let the receiving application block * delivery of packets with coverage values less than a value * provided by the application."
*/ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len); goto drop;
} /* The next case involves violating the min. coverage requested * by the receiver. This is subtle: if receiver wants x and x is * greater than the buffersize/MTU then receiver will complain * that it wants x while sender emits packets of smaller size y. * Therefore the above ...()->partial_cov statement is essential.
*/ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
UDP_SKB_CB(skb)->cscov, pcrlen); goto drop;
}
}
prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb)) goto csum_error;
if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop;
/* NOTE(review): the following disclaimer is extraneous boilerplate from the
 * web page this file was extracted from; it is not part of the kernel
 * source. English translation of the original German text: "The information
 * on this website was carefully compiled to the best of our knowledge.
 * However, neither completeness, nor correctness, nor quality of the
 * information provided is guaranteed. Remark: the colored syntax display
 * and the measurement are still experimental."
 */