/*
 * The following ports, 16385, 18634, 18635, are registered with IANA as
 * the ports to be used for RDS over TCP and UDP.  Currently, only RDS over
 * TCP and RDS over IB/RDMA are implemented.
 *
 * 18634 is the historical value used for the RDMA_CM listener port.
 * RDS/TCP uses port 16385.  After IPv6 work, RDMA_CM also uses 16385 as
 * the listener port.  18634 is kept to ensure compatibility with older
 * RDS modules.
 *
 * Those ports are defined in each transport's header file.
 */
#define RDS_PORT	18634
/*
 * Connection state tracking.
 *
 * A connection is always in one of the following states.  Updates to
 * the state are atomic and imply a memory barrier.
 */
enum {
	RDS_CONN_DOWN		= 0,
	RDS_CONN_CONNECTING	= 1,
	RDS_CONN_DISCONNECTING	= 2,
	RDS_CONN_UP		= 3,
	RDS_CONN_RESETTING	= 4,
	RDS_CONN_ERROR		= 5,
};
/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define RDS_MPATH_WORKERS	8

/*
 * Hash socket @rs into the range [0, @n): the bound port (converted with
 * ntohs()) is hashed with the socket's rs_hash_initval via jhash_1word()
 * and the result is masked with (@n - 1).  The mask only covers the whole
 * range when @n is a power of 2.
 */
#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \
			       (rs)->rs_hash_initval) & ((n) - 1))
/*
 * RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
 * probe to exchange control information before establishing a connection.
 * Currently the control information that is exchanged is the number of
 * supported paths.  If the peer is a legacy (older kernel revision) peer,
 * it would return a pong message without additional control information
 * that would then alert the sender that the peer was an older rev.
 */
#define RDS_FLAG_PROBE_PORT	1

/*
 * True when the (sport, dport) pair is <RDS_FLAG_PROBE_PORT, 0> or
 * <0, RDS_FLAG_PROBE_PORT>.  The arguments are parenthesized so that
 * expression arguments are compared as a whole; each argument may be
 * evaluated up to twice, so avoid side effects in them.
 */
#define RDS_HS_PROBE(sport, dport) \
	(((sport) == RDS_FLAG_PROBE_PORT && (dport) == 0) || \
	 ((sport) == 0 && (dport) == RDS_FLAG_PROBE_PORT))

/*
 * Maximum space available for extension headers.
 */
#define RDS_HEADER_EXT_SPACE	16
/* Reserved - indicates end of extensions */
#define RDS_EXTHDR_NONE		0
/*
 * This extension header is included in the very
 * first message that is sent on a new connection,
 * and identifies the protocol level.  This will help
 * rolling updates if a future change requires breaking
 * the protocol.
 * NB: This is no longer true for IB, where we do a version
 * negotiation during the connection setup phase (protocol
 * version information is included in the RDMA CM private data).
 */
#define RDS_EXTHDR_VERSION	1
struct rds_ext_header_version {
	__be32			h_version;	/* big-endian 32-bit value */
};
/*
 * This extension header is included in the RDS message
 * chasing an RDMA operation.
 */
#define RDS_EXTHDR_RDMA		2
struct rds_ext_header_rdma {
	__be32			h_rdma_rkey;	/* big-endian 32-bit value */
};
/*
 * This extension header tells the peer about the
 * destination <R_Key,offset> of the requested RDMA
 * operation.
 */
#define RDS_EXTHDR_RDMA_DEST	3
struct rds_ext_header_rdma_dest {
	__be32			h_rdma_rkey;	/* R_Key, big-endian */
	__be32			h_rdma_offset;	/* offset, big-endian */
};
/*
 * Extension header announcing number of paths.
 * Implicit length = 2 bytes.
 */
#define RDS_EXTHDR_NPATHS	5
/* NOTE(review): presumably a generation-number extension header; confirm. */
#define RDS_EXTHDR_GEN_NUM	6
/*
 * m_sock_item and m_conn_item are on lists that are serialized under
 * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
 * the message will not be put back on the retransmit list after being sent.
 * Messages that are canceled while being sent rely on this.
 *
 * m_inc is used by loopback so that it can pass an incoming message straight
 * back up into the rx path.  It embeds a wire header which is also used by
 * the send path, which is kind of awkward.
 *
 * m_sock_item indicates the message's presence on a socket's send or receive
 * queue.  m_rs will point to that socket.
 *
 * m_daddr is used by cancellation to prune messages to a given destination.
 *
 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
 * nesting.  As paths iterate over messages on a sock, or conn, they must
 * also lock the conn, or sock, to remove the message from those lists too.
 * Testing the flag to determine if the message is still on the lists lets
 * us avoid testing the list_head directly.  That means each path can use
 * the message's list_head to keep it on a local list while juggling locks
 * without confusing the other path.
 *
 * m_ack_seq is an optional field set by transports who need a different
 * sequence number range to invalidate.  They can use this in a callback
 * that they pass to rds_send_drop_acked() to see if each message has been
 * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
 * had ack_seq set yet.
 */
#define RDS_MSG_ON_SOCK		1
#define RDS_MSG_ON_CONN		2
#define RDS_MSG_HAS_ACK_SEQ	3
#define RDS_MSG_ACK_REQUIRED	4
#define RDS_MSG_RETRANSMITTED	5
#define RDS_MSG_MAPPED		6
#define RDS_MSG_PAGEVEC		7
#define RDS_MSG_FLUSH		8
/*
 * The RDS notifier is used (optionally) to tell the application about
 * completed RDMA operations.  Rather than keeping the whole rds message
 * around on the queue, we allocate a small notifier that is put on the
 * socket's notifier_list.  Notifications are delivered to the application
 * through control messages.
 */
struct rds_notifier {
	struct list_head	n_list;		/* list linkage */
	uint64_t		n_user_token;
	int			n_status;
};
/*
 * Available as part of RDS core, so doesn't need to participate
 * in get_preferred transport etc
 */
#define RDS_TRANS_LOOP	3
/**
 * struct rds_transport - transport specific behavioural hooks
 *
 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
 *        part of a message.  The caller serializes on the send_sem so this
 *        doesn't need to be reentrant for a given conn.  The header must be
 *        sent before the data payload.  .xmit must be prepared to send a
 *        message with no data payload.  .xmit should return the number of
 *        bytes that were sent down the connection, including header bytes.
 *        Returning 0 tells the caller that it doesn't need to perform any
 *        additional work now.  This is usually the case when the transport has
 *        filled the sending queue for its connection and will handle
 *        triggering the rds thread to continue the send when space becomes
 *        available.  Returning -EAGAIN tells the caller to retry the send
 *        immediately.  Returning -ENOMEM tells the caller to retry the send at
 *        some point in the future.
 *
 * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
 *                 it returns the connection can not call rds_recv_incoming().
 *                 This will only be called once after conn_connect returns
 *                 non-zero success.  The caller serializes this with
 *                 the send and connecting paths (xmit_* and conn_*).  The
 *                 transport is responsible for other serialization, including
 *                 rds_recv_incoming().  This is called in process context but
 *                 should try hard not to block.
 */
/*
 * Bind hash table key length.  It is the sum of the size of a struct
 * in6_addr, a scope_id and a port.
 */
#define RDS_BOUND_KEY_LEN \
	(sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
struct rds_sock { struct sock rs_sk;
u64 rs_user_addr;
u64 rs_user_bytes;
/* * bound_addr used for both incoming and outgoing, no INADDR_ANY * support.
*/ struct rhash_head rs_bound_node;
u8 rs_bound_key[RDS_BOUND_KEY_LEN]; struct sockaddr_in6 rs_bound_sin6; #define rs_bound_addr rs_bound_sin6.sin6_addr #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] #define rs_bound_port rs_bound_sin6.sin6_port #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id struct in6_addr rs_conn_addr; #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
__be16 rs_conn_port; struct rds_transport *rs_transport;
/* * rds_sendmsg caches the conn it used the last time around. * This helps avoid costly lookups.
*/ struct rds_connection *rs_conn;
/* flag indicating we were congested or not */ int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion;
/* rs_lock protects all these adjacent members before the newline */
spinlock_t rs_lock; struct list_head rs_send_queue;
u32 rs_snd_bytes; int rs_rcv_bytes; struct list_head rs_notify_queue; /* currently used for failed RDMAs */
/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask * to decide whether the application should be woken up. * If not set, we use rs_cong_track to find out whether a cong map * update arrived.
*/
uint64_t rs_cong_mask;
uint64_t rs_cong_notify; struct list_head rs_cong_list; unsignedlong rs_cong_track;
/* * rs_recv_lock protects the receive queue, and is * used to serialize with rds_release.
*/
rwlock_t rs_recv_lock; struct list_head rs_recv_queue;
/* just for stats reporting */ struct list_head rs_item;
/* these have their own lock */
spinlock_t rs_rdma_lock; struct rb_root rs_rdma_keys;
/* Socket options - in case there will be more */ unsignedchar rs_recverr,
rs_cong_monitor;
u32 rs_hash_initval;
/*
 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
 * to account for overhead.  We don't account for overhead, we just apply
 * the number of payload bytes to the specified value.
 */

/* Return half of sk_sndbuf of the sock obtained through rds_rs_to_sk(rs). */
static inline int rds_sk_sndbuf(struct rds_sock *rs)
{
	return rds_rs_to_sk(rs)->sk_sndbuf / 2;
}

/* Return half of sk_rcvbuf of the sock obtained through rds_rs_to_sk(rs). */
static inline int rds_sk_rcvbuf(struct rds_sock *rs)
{
	return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.