// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	TCP over IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on:
 *	linux/net/ipv4/tcp.c
 *	linux/net/ipv4/tcp_input.c
 *	linux/net/ipv4/tcp_output.c
 *
 *	Fixes:
 *	Hideaki YOSHIFUJI	:	sin6_scope_id support
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	YOSHIFUJI Hideaki @USAGI:	convert /proc/net/tcp6 to seq_file.
 */
/* Helper returning the inet6 address from a given tcp socket.
 * It can be used in TCP stack instead of inet6_sk(sk).
 * This avoids a dereference and allows compiler optimizations.
 * It is a specialized version of inet6_sk_generic().
 */
#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \
					      struct tcp6_sock, tcp)->inet6)
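/* Illustrative usage sketch (added comment, not from the original file):
 * because struct tcp6_sock lays the ipv6_pinfo out directly after the
 * tcp_sock, tcp_inet6_sk() is a constant-offset computation, e.g.:
 *
 *	struct ipv6_pinfo *np = tcp_inet6_sk(sk);
 *
 * whereas inet6_sk(sk) would first load the pinet6 pointer from the socket.
 */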
static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v6_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < SIN6_LEN_RFC2133)
		return -EINVAL;
	if (usin->sin6_family != AF_INET6)
		return -EAFNOSUPPORT;
memset(&fl6, 0, sizeof(fl6));
if (inet6_test_bit(SNDFLOW, sk)) {
		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
		IP6_ECN_flow_init(fl6.flowlabel);
		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
			struct ip6_flowlabel *flowlabel;
			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
			if (IS_ERR(flowlabel))
				return -EINVAL;
fl6_sock_release(flowlabel);
}
}
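/* Illustration (hypothetical userspace snippet, not part of this file):
 * a caller reaches the SNDFLOW path above by enabling IPV6_FLOWINFO_SEND
 * and passing a flow label previously acquired via IPV6_FLOWLABEL_MGR:
 *
 *	int on = 1;
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_FLOWINFO_SEND, &on, sizeof(on));
 *	sin6.sin6_flowinfo = label & IPV6_FLOWINFO_MASK;
 *	connect(fd, (struct sockaddr *)&sin6, sizeof(sin6));
 */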
	/*
	 * connect() to INADDR_ANY means loopback (BSD'ism).
	 */
	if (ipv6_addr_any(&usin->sin6_addr)) {
		if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
&usin->sin6_addr); else
usin->sin6_addr = in6addr_loopback;
}
addr_type = ipv6_addr_type(&usin->sin6_addr);
	if (addr_type & IPV6_ADDR_MULTICAST)
		return -ENETUNREACH;
	if (addr_type & IPV6_ADDR_LINKLOCAL) {
		if (addr_len >= sizeof(struct sockaddr_in6) &&
		    usin->sin6_scope_id) {
			/* If interface is set while binding, indices
			 * must coincide.
			 */
			if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id))
				return -EINVAL;
sk->sk_bound_dev_if = usin->sin6_scope_id;
}
		/* Connect to link-local address requires an interface */
		if (!sk->sk_bound_dev_if)
			return -EINVAL;
}
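/* Illustration (hypothetical userspace snippet, not part of this file):
 * connecting to a link-local peer needs a scope, matching the checks above:
 *
 *	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
 *	sin6.sin6_scope_id = if_nametoindex("eth0");
 *	connect(fd, (struct sockaddr *)&sin6, sizeof(sin6));
 *
 * Without sin6_scope_id or a bound device, connect() fails with -EINVAL.
 */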
	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	/* Drop requests trying to increase our current mss.
	 * Check done in __ip6_rt_update_pmtu() is too late.
	 */
	if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
		return;
	dst = inet6_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;
if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
tcp_sync_mss(sk, dst_mtu(dst));
tcp_simple_retransmit(sk);
}
}
sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
				  ICMP6_MIB_INERRORS);
		return -ENOENT;
}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
tcp_ao_ignore_icmp(sk, AF_INET6, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
}
seq = ntohl(th->seq);
	fatal = icmpv6_err_convert(type, code, &err);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, fatal);
		return 0;
}
if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) {
		sock_put(sk);
		return 0;
}
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	if (sk->sk_state == TCP_CLOSE)
		goto out;
	if (static_branch_unlikely(&ip6_min_hopcount)) {
		/* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
		if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
}
}
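	/* Illustration (hypothetical userspace snippet, not part of this
	 * file): the hop-limit floor tested above is configured with
	 *
	 *	int min = 255;
	 *	setsockopt(fd, IPPROTO_IPV6, IPV6_MINHOPCOUNT, &min, sizeof(min));
	 *
	 * an RFC 5082 (GTSM)-style defense against off-link spoofing.
	 */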
	if (type == NDISC_REDIRECT) {
		if (!sock_owned_by_user(sk)) {
			struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
if (dst)
dst->ops->redirect(dst, sk, skb);
		}
		goto out;
}
if (type == ICMPV6_PKT_TOOBIG) {
u32 mtu = ntohl(info);
		/* We are not interested in TCP_LISTEN and open_requests
		 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
		 * they should go through unfragmented).
		 */
		if (sk->sk_state == TCP_LISTEN)
			goto out;
		if (!ip6_sk_accept_pmtu(sk))
			goto out;
		if (mtu < IPV6_MIN_MTU)
			goto out;
WRITE_ONCE(tp->mtu_info, mtu);
if (!sock_owned_by_user(sk))
			tcp_v6_mtu_reduced(sk);
		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
					   &sk->sk_tsq_flags))
			sock_hold(sk);
		goto out;
}
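	/* Note (added comment): when the socket is owned by userspace, the
	 * PMTU update is deferred via TCP_MTU_REDUCED_DEFERRED; release_sock()
	 * later runs tcp_release_cb(), which calls the mtu_reduced() callback
	 * (tcp_v6_mtu_reduced() here) and drops the reference taken by
	 * sock_hold() above.
	 */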
	/* Might be for a request_sock */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;
	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;
if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3];
		/* Don't allow keys for peers that have a matching TCP-AO key.
		 * See the comment in tcp_ao_add_cmd().
		 */
		if (tcp_ao_required(sk, addr, AF_INET,
				    l3flag ? l3index : -1, false))
			return -EKEYREJECTED;
		return tcp_md5_do_add(sk, addr,
AF_INET, prefixlen, l3index, flags,
cmd.tcpm_key, cmd.tcpm_keylen);
}
addr = (union tcp_md5_addr *)&sin6->sin6_addr;
	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd().
	 */
	if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;
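/* Illustration (hypothetical userspace snippet, not part of this file):
 * both branches above are reached through the TCP_MD5SIG sockopt, roughly:
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 16 };
 *	memcpy(&md5.tcpm_addr, &peer_sin6, sizeof(peer_sin6));
 *	memcpy(md5.tcpm_key, key_bytes, 16);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A v4-mapped tcpm_addr takes the AF_INET branch; a native address takes
 * the AF_INET6 one.
 */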
	/* So that link locals have meaning */
	if ((!sk_listener->sk_bound_dev_if || l3_slave) &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
if (tsecr)
		tot_len += TCPOLEN_TSTAMP_ALIGNED;
	if (tcp_key_is_md5(key))
		tot_len += TCPOLEN_MD5SIG_ALIGNED;
	if (tcp_key_is_ao(key))
tot_len += tcp_ao_len_aligned(key->ao_key);
#ifdef CONFIG_MPTCP
	if (rst && !tcp_key_is_md5(key)) {
mrst = mptcp_reset_option(skb);
if (mrst)
tot_len += sizeof(__be32);
	}
#endif
	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (!buff)
		return;
	fl6.flowi6_proto = IPPROTO_TCP;
	if (rt6_need_strict(&fl6.daddr) && !oif)
		fl6.flowi6_oif = tcp_v6_iif(skb);
	else {
		if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
oif = skb->skb_iif;
fl6.flowi6_oif = oif;
}
	if (sk) {
		/* unconstify the socket only to attach it to buff with care. */
skb_set_owner_edemux(buff, (struct sock *)sk);
if (sk->sk_state == TCP_TIME_WAIT)
			mark = inet_twsk(sk)->tw_mark;
		else
mark = READ_ONCE(sk->sk_mark);
skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC);
	}
	if (txhash) {
		/* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */
skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4);
}
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
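	/* Note (added comment): "a ?: b" is the GNU shorthand for
	 * "a ? a : b". When the fwmark_reflect sysctl is on and the incoming
	 * skb carried a nonzero mark, that mark is reflected; otherwise the
	 * socket/timewait mark computed above is used.
	 */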
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
	/* Pass a socket to ip6_dst_lookup_flow() even when it is for a RST;
	 * the underlying function will use it to retrieve the network
	 * namespace.
	 */
	if (sk && sk->sk_state != TCP_TIME_WAIT)
		dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /* sk's xfrm_policy can be referred to */
	else
		dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
	if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
tclass, priority);
		TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
		if (rst)
			TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
		return;
}
	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped our
	 * dst.
	 */
	if (!sk && !ipv6_unicast_destination(skb))
		return;
	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
		return;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
	rcu_read_lock();
#endif
#ifdef CONFIG_TCP_MD5SIG
	if (sk && sk_fullsock(sk)) {
		int l3index;
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
		key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
		if (key.md5_key)
key.type = TCP_KEY_MD5;
	} else if (md5_hash_location) {
		int dif = tcp_v6_iif_l3_slave(skb);
		int sdif = tcp_v6_sdif(skb);
		int l3index;
		/* The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked with the md5 hash of the
		 * found key; no RST is generated if the md5 hash doesn't
		 * match.
		 */
sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
NULL, 0, &ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->source),
					    dif, sdif);
		if (!sk1)
			goto out;
		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
l3index = tcp_v6_sdif(skb) ? dif : 0;
static void tcp_v6_restore_cb(struct sk_buff *skb)
{
	/* We need to move header back to the beginning if xfrm6_policy_check()
	 * and tcp_v6_fill_cb() are going to be called again.
	 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
	 */
memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm));
}
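/* Sketch of the save/restore pairing (added comment): tcp_v6_fill_cb()
 * copies IP6CB(skb) into TCP_SKB_CB(skb)->header.h6 so TCP may reuse
 * skb->cb, and tcp_v6_restore_cb() moves it back before the skb is handed
 * to code that expects IP6CB(), such as the pktoptions paths below.
 */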
/*
 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
 * here, tcp_create_openreq_child now does this for us, see the comment in
 * that function for the gory details. -acme
 */
	/* This is a tricky place. Until this moment the IPv4 code worked with
	 * the IPv6 icsk.icsk_af_ops. Sync it now.
	 */
tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
return newsk;
}
ireq = inet_rsk(req);
	if (sk_acceptq_is_full(sk))
		goto exit_overflow;
if (!dst) {
		dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
		if (!dst)
			goto exit;
}
	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;
	/*
	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
	 * count here, tcp_create_openreq_child now does this for us, see the
	 * comment in that function for the gory details. -acme
	 */
	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
	/* Clone native IPv6 options from the listening socket (if any).
	 * Yes, keeping a reference count would be much more clever, but we
	 * would then need one more thing here: reattaching optmem to newsk.
	 */
	opt = ireq->ipv6_opt;
	if (!opt)
		opt = rcu_dereference(np->opt);
	if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
}
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (opt)
inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
opt->opt_flen;
	if (!tcp_rsk_used_ao(req)) {
		/* Copy over the MD5 key from the original socket */
		key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
		if (key) {
			const union tcp_md5_addr *addr;

			addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
			if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128,
					     l3index, key))
				goto put_and_exit;
}
	}
#endif
#ifdef CONFIG_TCP_AO
	/* Copy over tcp_ao_info if any */
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
		goto put_and_exit; /* OOM */
#endif
	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (*own_req) {
tcp_move_syn(newtp, req);
		/* Clone pktoptions received with SYN, if we own the req */
		if (ireq->pktopts) {
newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
consume_skb(ireq->pktopts);
			ireq->pktopts = NULL;
			if (newnp->pktoptions)
tcp_v6_restore_cb(newnp->pktoptions);
}
	} else {
		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case
			 */
bh_unlock_sock(newsk);
sock_put(newsk);
newsk = NULL;
}
}
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							    u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = tcp_inet6_sk(sk);
	struct sk_buff *opt_skb = NULL;
	enum skb_drop_reason reason;
	struct tcp_sock *tp;
	/* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive
	 * handler and gets backlogged. From the backlog it always ends up
	 * here. Kerboom...
	 * Fortunately, tcp_rcv_established() and rcv_established() handle
	 * them correctly, but it is not the case with tcp_v6_hnd_req() and
	 * tcp_v6_send_reset(). --ANK
	 */
	if (skb->protocol == htons(ETH_P_IP))
		return tcp_v4_do_rcv(sk, skb);
	/*
	 * socket locking is here for SMP purposes as backlog rcv is currently
	 * called with bh processing disabled.
	 */
	/* Do Stevens' IPV6_PKTOPTIONS.
	 *
	 * Yes, guys, it is the only place in our code where we
	 * may make it without affecting IPv4. The rest of the code is
	 * protocol independent, and I do not like the idea of uglifying
	 * IPv4.
	 *
	 * Actually, the whole idea behind IPV6_PKTOPTIONS looks not very
	 * well thought out. For now we latch options received in the last
	 * packet enqueued by tcp. Feel free to propose a better solution.
	 * --ANK (980728)
	 */
	if (np->rxopt.all && sk->sk_state != TCP_LISTEN)
opt_skb = skb_clone_and_charge_r(skb, sk);
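	/* Note (added comment): skb_clone_and_charge_r() clones the skb and
	 * charges its truesize to the socket receive buffer, so latched
	 * pktoptions stay accounted against sk_rmem_alloc.
	 */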
	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;
	/* Note that opt_skb is latched into np->pktoptions only if:
	 *
	 *	1. skb was enqueued by tcp.
	 *	2. skb is added to tail of read queue, rather than out of order.
	 *	3. socket is not in passive state.
	 *	4. Finally, it really contains options, which user wants to receive.
	 */
	tp = tcp_sk(sk);
	if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
			WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb));
		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
WRITE_ONCE(np->mcast_hops,
				   ipv6_hdr(opt_skb)->hop_limit);
		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
		if (inet6_test_bit(REPFLOW, sk))
			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
		if (ipv6_opt_accepted(sk, opt_skb,
				      &TCP_SKB_CB(opt_skb)->header.h6)) {
tcp_v6_restore_cb(opt_skb);
opt_skb = xchg(&np->pktoptions, opt_skb);
} else {
__kfree_skb(opt_skb);
opt_skb = xchg(&np->pktoptions, NULL);
}
}
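	/* Note (added comment): the xchg() above atomically latches the
	 * newest options skb into np->pktoptions and returns the previous
	 * one (possibly NULL), which falls through to consume_skb() below.
	 */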
	consume_skb(opt_skb);
	return 0;
}
static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
			   const struct tcphdr *th)
{
	/* This is tricky: we move IP6CB at its correct location into
	 * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
	 * _decode_session6() uses IP6CB().
	 * barrier() makes sure compiler won't play aliasing games.
	 */
memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm));
barrier();
/* NOTE: A lot of things set to zero explicitly by call to
 *	 sk_alloc() so need not be done here.
 */
static int tcp_v6_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	state = inet_sk_state_load(sp);
	if (state == TCP_LISTEN)
		rx_queue = READ_ONCE(sp->sk_ack_backlog);
	else
		/* Because we don't lock the socket,
		 * we might find a transient negative value.
		 */
rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
READ_ONCE(tp->copied_seq), 0);