/* No TX buffers available, try to steal the list from the * completion handler.
*/ if (unlikely(index == -1)) {
tx->dqo_tx.free_tx_qpl_buf_head =
atomic_xchg(&tx->dqo_compl.free_tx_qpl_buf_head, -1);
index = tx->dqo_tx.free_tx_qpl_buf_head;
if (unlikely(index == -1)) return index;
}
/* Remove TX buf from free list */
tx->dqo_tx.free_tx_qpl_buf_head = tx->dqo.tx_qpl_buf_next[index];
index = pkt->tx_qpl_buf_ids[0]; /* Create a linked list of buffers to be added to the free list */ for (i = 1; i < pkt->num_bufs; i++) {
tx->dqo.tx_qpl_buf_next[index] = pkt->tx_qpl_buf_ids[i];
index = pkt->tx_qpl_buf_ids[i];
}
while (true) {
s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_tx_qpl_buf_head);
/* No pending_packets available, try to steal the list from the * completion handler.
*/ if (unlikely(index == -1)) {
tx->dqo_tx.free_pending_packets =
atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
index = tx->dqo_tx.free_pending_packets;
if (unlikely(index == -1)) return NULL;
}
pending_packet = &tx->dqo.pending_packets[index];
/* Remove pending_packet from free list */
tx->dqo_tx.free_pending_packets = pending_packet->next;
pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;
/* Queue sizes must be a power of 2 */
tx->mask = cfg->ring_size - 1;
tx->dqo.complq_mask = tx->mask;
/* The max number of pending packets determines the maximum number of * descriptors which maybe written to the completion queue. * * We must set the number small enough to make sure we never overrun the * completion queue.
*/
num_pending_packets = tx->dqo.complq_mask + 1;
/* Reserve space for descriptor completions, which will be reported at * most every GVE_TX_MIN_RE_INTERVAL packets.
*/
num_pending_packets -=
(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;
/* Each packet may have at most 2 buffer completions if it receives both * a miss and reinjection completion.
*/
num_pending_packets /= 2;
/* Allocate the DQO TX ring array and every requested TX ring.
 *
 * Returns 0 on success. On failure, returns a negative errno and frees any
 * rings that were already allocated, so the caller sees all-or-nothing.
 *
 * NOTE(review): the extracted block had a `goto err` with no matching label,
 * no return statement, and the free path fused into the success path
 * (rings were freed and cfg->tx nulled right after allocation). Restored the
 * conventional success/err-unwind structure.
 */
int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
			   struct gve_tx_alloc_rings_cfg *cfg)
{
	struct gve_tx_ring *tx = cfg->tx;
	int total_queues;
	int err = 0;
	int i, j;

	total_queues = cfg->qcfg->num_queues + cfg->num_xdp_rings;
	if (total_queues > cfg->qcfg->max_queues) {
		netif_err(priv, drv, priv->dev,
			  "Cannot alloc more than the max num of Tx rings\n");
		return -EINVAL;
	}

	/* Size the array for the max so the queue count can grow later
	 * without reallocating.
	 */
	tx = kvcalloc(cfg->qcfg->max_queues, sizeof(struct gve_tx_ring),
		      GFP_KERNEL);
	if (!tx)
		return -ENOMEM;

	for (i = 0; i < total_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, cfg, &tx[i], i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	cfg->tx = tx;
	return 0;

err:
	/* Unwind only the rings that were successfully allocated. */
	for (j = 0; j < i; j++)
		gve_tx_free_ring_dqo(priv, &tx[j], cfg);
	kvfree(tx);
	return err;
}
/* Returns the number of slots available in the ring */ static u32 num_avail_tx_slots(conststruct gve_tx_ring *tx)
{
u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;
return tx->mask - num_used;
}
/* Checks if the requested number of slots are available in the ring */ staticbool gve_has_tx_slots_available(struct gve_tx_ring *tx, u32 slots_req)
{
u32 num_avail = num_avail_tx_slots(tx);
/* Update cached TX head pointer */
tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);
return num_avail_tx_slots(tx) >= slots_req;
}
staticbool gve_has_avail_slots_tx_dqo(struct gve_tx_ring *tx, int desc_count, int buf_count)
{ return gve_has_pending_packet(tx) &&
gve_has_tx_slots_available(tx, desc_count) &&
gve_has_free_tx_qpl_bufs(tx, buf_count);
}
/* Stops the queue if available descriptors is less than 'count'. * Return: 0 if stop is not required.
*/ staticint gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int desc_count, int buf_count)
{ if (likely(gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count))) return 0;
/* No space, so stop the queue */
tx->stop_queue++;
netif_tx_stop_queue(tx->netdev_txq);
/* Sync with restarting queue in `gve_tx_poll_dqo()` */
mb();
/* After stopping queue, check if we can transmit again in order to * avoid TOCTOU bug.
*/ if (likely(!gve_has_avail_slots_tx_dqo(tx, desc_count, buf_count))) return -EBUSY;
/* Validates and prepares `skb` for TSO. * * Returns header length, or < 0 if invalid.
*/ staticint gve_prep_tso(struct sk_buff *skb)
{ struct tcphdr *tcp; int header_len;
u32 paylen; int err;
/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length * of the TSO to be <= 262143. * * However, we don't validate these because: * - Hypervisor enforces a limit of 9K MTU * - Kernel will not produce a TSO larger than 64k
*/
if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO)) return -1;
if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) return -EINVAL;
/* Needed because we will modify header. */
err = skb_cow_head(skb, 0); if (err < 0) return err;
/* Note: HW requires that the size of a non-TSO packet be within the * range of [17, 9728]. * * We don't double check because * - We limited `netdev->min_mtu` to ETH_MIN_MTU. * - Hypervisor won't allow MTU larger than 9216.
*/
pkt->num_bufs = 0; /* Map the linear portion of skb */
{
u32 len = skb_headlen(skb);
dma_addr_t addr;
for (i = 0; i < shinfo->nr_frags; i++) { unsignedint frag_size = skb_frag_size(&shinfo->frags[i]);
num_descs += gve_num_descs_per_buf(frag_size);
}
return num_descs;
}
/* Returns true if HW is capable of sending TSO represented by `skb`. * * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers. * - The header is counted as one buffer for every single segment. * - A buffer which is split between two segments is counted for both. * - If a buffer contains both header and payload, it is counted as two buffers.
*/ staticbool gve_can_send_tso(conststruct sk_buff *skb)
{ constint max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1; conststruct skb_shared_info *shinfo = skb_shinfo(skb); constint header_len = skb_tcp_all_headers(skb); constint gso_size = shinfo->gso_size; int cur_seg_num_bufs; int prev_frag_size; int cur_seg_size; int i;
for (i = 0; i < shinfo->nr_frags; i++) { if (cur_seg_size >= gso_size) {
cur_seg_size %= gso_size;
cur_seg_num_bufs = cur_seg_size > 0;
if (prev_frag_size > GVE_TX_MAX_BUF_SIZE_DQO) { int prev_frag_remain = prev_frag_size %
GVE_TX_MAX_BUF_SIZE_DQO;
/* If the last descriptor of the previous frag * is less than cur_seg_size, the segment will * span two descriptors in the previous frag. * Since max gso size (9728) is less than * GVE_TX_MAX_BUF_SIZE_DQO, it is impossible * for the segment to span more than two * descriptors.
*/ if (prev_frag_remain &&
cur_seg_size > prev_frag_remain)
cur_seg_num_bufs++;
}
}
if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg)) returnfalse;
/* Per-skb feature check for the DQO format.
 *
 * If the skb is a TSO that the HW cannot segment as laid out, strip every
 * GSO feature bit so the stack falls back to software segmentation for
 * this packet; otherwise the advertised features pass through unchanged.
 */
netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
					 struct net_device *dev,
					 netdev_features_t features)
{
	netdev_features_t supported = features;

	if (skb_is_gso(skb) && !gve_can_send_tso(skb))
		supported &= ~NETIF_F_GSO_MASK;

	return supported;
}
/* Attempt to transmit specified SKB. * * Returns 0 if the SKB was transmitted or dropped. * Returns -1 if there is not currently enough space to transmit the SKB.
*/ staticint gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx, struct sk_buff *skb)
{ int num_buffer_descs; int total_num_descs;
if (skb_is_gso(skb) && unlikely(ipv6_hopopt_jumbo_remove(skb))) goto drop;
if (tx->dqo.qpl) { /* We do not need to verify the number of buffers used per * packet or per segment in case of TSO as with 2K size buffers * none of the TX packet rules would be violated. * * gve_can_send_tso() checks that each TCP segment of gso_size is * not distributed over more than 9 SKB frags..
*/
num_buffer_descs = DIV_ROUND_UP(skb->len, GVE_TX_BUF_SIZE_DQO);
} else {
num_buffer_descs = gve_num_buffer_descs_needed(skb); if (!skb_is_gso(skb)) { if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) { if (unlikely(skb_linearize(skb) < 0)) goto drop;
/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{ struct gve_priv *priv = netdev_priv(dev); struct gve_tx_ring *tx;
tx = &priv->tx[skb_get_queue_mapping(skb)]; if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) { /* We need to ring the txq doorbell -- we have stopped the Tx * queue for want of resources, but prior calls to gve_tx() * may have added descriptors without ringing the doorbell.
*/
gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail); return NETDEV_TX_BUSY;
}
if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more()) return NETDEV_TX_OK;
if (unlikely(is_reinjection)) { if (unlikely(pending_packet->state ==
GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
priv->dev->name, (int)compl_tag); /* Packet was already completed as a result of timeout, * so just remove from list and free pending packet.
*/
remove_from_list(tx,
&tx->dqo_compl.timed_out_completions,
pending_packet);
gve_free_pending_packet(tx, pending_packet); return;
} if (unlikely(pending_packet->state !=
GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) { /* No outstanding miss completion but packet allocated * implies packet receives a re-injection completion * without a prior miss completion. Return without * completing the packet.
*/
net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
priv->dev->name, (int)compl_tag); return;
}
remove_from_list(tx, &tx->dqo_compl.miss_completions,
pending_packet);
} else { /* Packet is allocated but not a pending data completion. */ if (unlikely(pending_packet->state !=
GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
net_err_ratelimited("%s: No pending data completion: %d\n",
priv->dev->name, (int)compl_tag); return;
}
}
tx->dqo_tx.completed_packet_desc_cnt += pending_packet->num_bufs;
switch (pending_packet->type) { case GVE_TX_PENDING_PACKET_DQO_SKB: if (tx->dqo.qpl)
gve_free_tx_qpl_bufs(tx, pending_packet); else
gve_unmap_packet(tx->dev, pending_packet);
(*pkts)++;
*bytes += pending_packet->skb->len;
next_index = tx->dqo_compl.miss_completions.head; while (next_index != -1) {
pending_packet = &tx->dqo.pending_packets[next_index];
next_index = pending_packet->next; /* Break early because packets should timeout in order. */ if (time_is_after_jiffies(pending_packet->timeout_jiffies)) break;
remove_from_list(tx, &tx->dqo_compl.miss_completions,
pending_packet); /* Unmap/free TX buffers and free skb but do not unallocate packet i.e. * the completion tag is not freed to ensure that the driver * can take appropriate action if a corresponding valid * completion is received later.
*/ if (tx->dqo.qpl)
gve_free_tx_qpl_bufs(tx, pending_packet); else
gve_unmap_packet(tx->dev, pending_packet);
/* This indicates the packet was dropped. */
dev_kfree_skb_any(pending_packet->skb);
pending_packet->skb = NULL;
tx->dropped_pkt++;
net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
priv->dev->name,
(int)(pending_packet - tx->dqo.pending_packets));
pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
pending_packet->timeout_jiffies =
jiffies +
secs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT); /* Maintain pending packet in another list so the packet can be * unallocated at a later time.
*/
add_to_list(tx, &tx->dqo_compl.timed_out_completions,
pending_packet);
}
}
next_index = tx->dqo_compl.timed_out_completions.head; while (next_index != -1) {
pending_packet = &tx->dqo.pending_packets[next_index];
next_index = pending_packet->next; /* Break early because packets should timeout in order. */ if (time_is_after_jiffies(pending_packet->timeout_jiffies)) break;
/* Limit in order to avoid blocking for too long */ while (!napi || pkt_compl_pkts < napi->weight) { struct gve_tx_compl_desc *compl_desc =
&tx->dqo.compl_ring[tx->dqo_compl.head];
u16 type;
if (compl_desc->generation == tx->dqo_compl.cur_gen_bit) break;
/* Prefetch the next descriptor. */
prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
tx->dqo.complq_mask]);
/* Do not read data until we own the descriptor */
dma_rmb();
type = compl_desc->type;
if (type == GVE_COMPL_TYPE_DQO_DESC) { /* This is the last descriptor fetched by HW plus one */
u16 tx_head = le16_to_cpu(compl_desc->tx_head);
tx->dqo_compl.head =
(tx->dqo_compl.head + 1) & tx->dqo.complq_mask; /* Flip the generation bit when we wrap around */
tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
num_descs_cleaned++;
}
if (tx->netdev_txq)
netdev_tx_completed_queue(tx->netdev_txq,
pkt_compl_pkts + miss_compl_pkts,
pkt_compl_bytes + miss_compl_bytes);
/* Return true if we still have work. */
compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}
/* Return true if we still have work. */
compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head]; return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}
int gve_xdp_xmit_one_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, struct xdp_frame *xdpf)
{ struct gve_tx_pending_packet_dqo *pkt;
u32 desc_idx = tx->dqo_tx.tail;
s16 completion_tag; int num_descs = 1;
dma_addr_t addr; int err;
if (unlikely(!gve_has_tx_slots_available(tx, num_descs))) return -EBUSY;
pkt = gve_alloc_pending_packet(tx); if (unlikely(!pkt)) return -EBUSY;
/*
 * NOTE(review): the following website-disclaimer boilerplate (originally in
 * German) was captured during extraction and is not part of the driver
 * source; it is kept as a comment so the file remains compilable:
 * "The information on this website has been compiled carefully and to the
 * best of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the information provided is guaranteed.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */