/* * Copyright (c) 2007 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. *
*/
staticvoid mlx4_en_free_frag(conststruct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_alloc *frag)
{ if (frag->page)
page_pool_put_full_page(ring->pp, frag->page, false); /* We need to clear all fields, otherwise a change of priv->log_rx_info * could lead to see garbage later in frag->page.
*/
memset(frag, 0, sizeof(*frag));
}
staticvoid mlx4_en_init_rx_desc(conststruct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index)
{ struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; int possible_frags; int i;
/* Set size and memtype fields */ for (i = 0; i < priv->num_frags; i++) {
rx_desc->data[i].byte_count =
cpu_to_be32(priv->frag_info[i].frag_size);
rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
}
/* If the number of used fragments does not fill up the ring stride, * remaining (unused) fragments must be padded with null address/size
* and a special memory key */
possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE; for (i = priv->num_frags; i < possible_frags; i++) {
rx_desc->data[i].byte_count = 0;
rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
rx_desc->data[i].addr = 0;
}
}
/* Not in the fast path: called only while bringing the RX rings up.
 * Populates all rings in lockstep, one descriptor per ring per pass.
 * Returns 0 on success (possibly after shrinking the rings), -ENOMEM if
 * even the minimum ring size could not be filled.
 */
static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
{
	struct mlx4_en_rx_ring *ring;
	int desc_idx, ring_idx;
	int trimmed_size;

	for (desc_idx = 0; desc_idx < priv->prof->rx_ring_size; desc_idx++) {
		for (ring_idx = 0; ring_idx < priv->rx_ring_num; ring_idx++) {
			ring = priv->rx_ring[ring_idx];
			if (mlx4_en_prepare_rx_desc(priv, ring,
						    ring->actual_size,
						    GFP_KERNEL)) {
				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
					en_err(priv, "Failed to allocate enough rx buffers\n");
					return -ENOMEM;
				}
				/* Enough for a usable (smaller) ring: fall
				 * back to the largest power of two we filled.
				 */
				trimmed_size = rounddown_pow_of_two(ring->actual_size);
				en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n",
					ring->actual_size, trimmed_size);
				goto reduce_rings;
			}
			ring->actual_size++;
			ring->prod++;
		}
	}
	return 0;

reduce_rings:
	/* Shrink every ring to the common size, releasing the extras. */
	for (ring_idx = 0; ring_idx < priv->rx_ring_num; ring_idx++) {
		ring = priv->rx_ring[ring_idx];
		while (ring->actual_size > trimmed_size) {
			ring->actual_size--;
			ring->prod--;
			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
		}
	}

	return 0;
}
staticvoid mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring)
{ int index;
int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
{ struct mlx4_en_rx_ring *ring; int i; int ring_ind; int err; int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
DS_SIZE * priv->num_frags);
for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
ring = priv->rx_ring[ring_ind];
/* Initialize all descriptors */ for (i = 0; i < ring->size; i++)
mlx4_en_init_rx_desc(priv, ring, i);
}
err = mlx4_en_fill_rx_buffers(priv); if (err) goto err_buffers;
for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
ring = priv->rx_ring[ring_ind];
/* We recover from out of memory by scheduling our napi poll * function (mlx4_en_process_cq), which tries to allocate * all missing RX buffers (call to mlx4_en_refill_rx_buffers).
*/ void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
{ int ring;
if (!priv->port_up) return;
for (ring = 0; ring < priv->rx_ring_num; ring++) { if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) {
local_bh_disable();
napi_schedule(&priv->rx_cq[ring]->napi);
local_bh_enable();
}
}
}
/* Try to batch allocations, but not too much. */ if (missing < 8) return; do { if (mlx4_en_prepare_rx_desc(priv, ring,
ring->prod & ring->size_mask,
GFP_ATOMIC | __GFP_MEMALLOC)) break;
ring->prod++;
} while (likely(--missing));
mlx4_en_update_rx_prod_db(ring);
}
/* When hardware doesn't strip the vlan, we need to calculate the checksum * over it and add it to the hardware's checksum calculation
*/ staticinline __wsum get_fixed_vlan_csum(__wsum hw_checksum, struct vlan_hdr *vlanh)
{ return csum_add(hw_checksum, *(__wsum *)vlanh);
}
/* Although the stack expects checksum which doesn't include the pseudo * header, the HW adds it. To address that, we are subtracting the pseudo * header checksum from the checksum value provided by the HW.
*/ staticint get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, struct iphdr *iph)
{
__u16 length_for_csum = 0;
__wsum csum_pseudo_header = 0;
__u8 ipproto = iph->protocol;
/* We reach this function only after checking that any of * the (IPv4 | IPv6) bits are set in cqe->status.
*/ staticint check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
netdev_features_t dev_features)
{
__wsum hw_checksum = 0; void *hdr;
/* CQE csum doesn't cover padding octets in short ethernet * frames. And the pad field is appended prior to calculating * and appending the FCS field. * * Detecting these padded frames requires to verify and parse * IP headers, so we simply force all those small frames to skip * checksum complete.
*/ if (short_frame(skb->len)) return -EINVAL;
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx * descriptor offset can be deduced from the CQE index instead of
* reading 'cqe->index' */
index = cq->mcq.cons_index & ring->size_mask;
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
/* Process all completed CQEs */ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
cq->mcq.cons_index & cq->size)) { struct mlx4_en_rx_alloc *frags; enum pkt_hash_types hash_type; struct sk_buff *skb; unsignedint length; int ip_summed; void *va; int nr;
frags = ring->rx_info + (index << priv->log_rx_info);
va = page_address(frags[0].page) + frags[0].page_offset;
net_prefetchw(va); /* * make sure we read the CQE after we read the ownership bit
*/
dma_rmb();
/* Drop packet on bad receive or bad checksum */ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR)) {
en_err(priv, "CQE completed in error - vendor syndrome:%d syndrome:%d\n",
((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
((struct mlx4_err_cqe *)cqe)->syndrome); goto next;
} if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); goto next;
}
/* Check if we need to drop the packet if SRIOV is not enabled * and not performing the selftest or flb disabled
*/ if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) { conststruct ethhdr *ethh = va;
dma_addr_t dma; /* Get pointer to first fragment since we haven't * skb yet and cast it to ethhdr struct
*/
dma = page_pool_get_dma_addr(frags[0].page);
dma += frags[0].page_offset;
dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
DMA_FROM_DEVICE);
/* Drop the packet, since HW loopback-ed it */
mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX];
bucket = &priv->mac_hash[mac_hash];
hlist_for_each_entry_rcu_bh(entry, bucket, hlist) { if (ether_addr_equal_64bits(entry->mac,
ethh->h_source)) goto next;
}
}
}
if (unlikely(priv->validate_loopback)) {
validate_loopback(priv, va); goto next;
}
/* * Packet is OK - process it.
*/
length = be32_to_cpu(cqe->byte_cnt);
length -= ring->fcs_del;
/* A bpf program gets first chance to drop the packet. It may * read bytes but not past the end of the frag.
*/ if (xdp_prog) {
dma_addr_t dma; void *orig_data;
u32 act;
if (likely(dev->features & NETIF_F_RXCSUM)) { /* TODO: For IP non TCP/UDP packets when csum complete is * not an option (not supported or any other reason) we can * actually check cqe IPOK status bit and report * CHECKSUM_UNNECESSARY rather than CHECKSUM_NONE
*/ if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
MLX4_CQE_STATUS_UDP)) &&
(cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
cqe->checksum == cpu_to_be16(0xffff)) { bool l2_tunnel;
/* If we used up all the quota - we're probably not done yet... */ if (done == budget || !clean_complete) { int cpu_curr;
/* in case we got here because of !clean_complete */
done = budget;
cpu_curr = smp_processor_id();
if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask))) return budget;
/* Current cpu is not according to smp_irq_affinity - * probably affinity changed. Need to stop this NAPI * poll, and restart it on the right CPU. * Try to avoid returning a too small value (like 0), * to not fool net_rx_action() and its netdev_budget
*/ if (done)
done--;
} /* Done for now */ if (likely(napi_complete_done(napi, done)))
mlx4_en_arm_cq(priv, cq); return done;
}
void mlx4_en_calc_rx_buf(struct net_device *dev)
{ struct mlx4_en_priv *priv = netdev_priv(dev); int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu); int i = 0;
/* bpf requires buffers to be set up as 1 packet per page. * This only works when num_frags == 1.
*/ if (priv->tx_ring_num[TX_XDP]) {
priv->frag_info[0].frag_size = eff_mtu; /* This will gain efficient xdp frame recycling at the * expense of more costly truesize accounting
*/
priv->frag_info[0].frag_stride = PAGE_SIZE;
priv->dma_dir = DMA_BIDIRECTIONAL;
priv->rx_headroom = XDP_PACKET_HEADROOM;
i = 1;
} else { int frag_size_max = 2048, buf_size = 0;
/* should not happen, right ? */ if (eff_mtu > PAGE_SIZE + (MLX4_EN_MAX_RX_FRAGS - 1) * 2048)
frag_size_max = PAGE_SIZE;
while (buf_size < eff_mtu) { int frag_stride, frag_size = eff_mtu - buf_size; int pad, nb;
if (i < MLX4_EN_MAX_RX_FRAGS - 1)
frag_size = min(frag_size, frag_size_max);
priv->frag_info[i].frag_size = frag_size;
frag_stride = ALIGN(frag_size, SMP_CACHE_BYTES); /* We can only pack 2 1536-bytes frames in on 4K page * Therefore, each frame would consume more bytes (truesize)
*/
nb = PAGE_SIZE / frag_stride;
pad = (PAGE_SIZE - nb * frag_stride) / nb;
pad &= ~(SMP_CACHE_BYTES - 1);
priv->frag_info[i].frag_stride = frag_stride + pad;
if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n");
rss_mask |= MLX4_RSS_BY_INNER_HEADERS;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.