/* * Copyright 2008-2010 Cisco Systems, Inc. All rights reserved. * Copyright 2007 Nuova Systems, Inc. All rights reserved. * * This program is free software; you may redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. *
*/
#define ENIC_LARGE_PKT_THRESHOLD 1000 #define ENIC_MAX_COALESCE_TIMERS 10 /* Interrupt moderation table, which will be used to decide the * coalescing timer values * {rx_rate in Mbps, mapping percentage of the range}
*/ staticstruct enic_intr_mod_table mod_table[ENIC_MAX_COALESCE_TIMERS + 1] = {
{4000, 0},
{4400, 10},
{5060, 20},
{5230, 30},
{5540, 40},
{5820, 50},
{6120, 60},
{6435, 70},
{6745, 80},
{7000, 90},
{0xFFFFFFFF, 100}
};
/* This table helps the driver to pick different ranges for rx coalescing * timer depending on the link speed.
*/ staticstruct enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = {
{0, 0}, /* 0 - 4 Gbps */
{0, 3}, /* 4 - 10 Gbps */
{3, 6}, /* 10+ Gbps */
};
/* Allocate and seed an affinity cpumask for every I/O MSI-X vector.
 * Error and notify vectors are skipped, as are vectors whose mask was
 * already populated (e.g. by a previous open).  Each mask gets one CPU
 * spread locally around the device's NUMA node.  Allocation failure is
 * tolerated: the vector is simply left without an affinity hint.
 */
static void enic_init_affinity_hint(struct enic *enic)
{
	int numa_node = dev_to_node(&enic->pdev->dev);
	int i;

	for (i = 0; i < enic->intr_count; i++) {
		if (enic_is_err_intr(enic, i) || enic_is_notify_intr(enic, i) ||
		    (cpumask_available(enic->msix[i].affinity_mask) &&
		     !cpumask_empty(enic->msix[i].affinity_mask)))
			continue;
		if (zalloc_cpumask_var(&enic->msix[i].affinity_mask,
				       GFP_KERNEL))
			cpumask_set_cpu(cpumask_local_spread(i, numa_node),
					enic->msix[i].affinity_mask);
	}
}
/* Free the per-vector affinity cpumasks allocated by
 * enic_init_affinity_hint().  Error/notify vectors never had a mask
 * allocated, so they are skipped here as well.
 */
static void enic_free_affinity_hint(struct enic *enic)
{
	int i;

	for (i = 0; i < enic->intr_count; i++) {
		if (enic_is_err_intr(enic, i) || enic_is_notify_intr(enic, i))
			continue;
		free_cpumask_var(enic->msix[i].affinity_mask);
	}
}
/* Push each I/O vector's affinity mask to the IRQ core as a hint, and
 * mirror every WQ vector's mask into the XPS map so transmit queue
 * selection follows the same CPUs.  Vectors without a usable mask
 * (error/notify, unallocated, or empty) are skipped; a failed hint
 * update is only warned about, not fatal.
 */
static void enic_set_affinity_hint(struct enic *enic)
{
	int i;
	int err;

	for (i = 0; i < enic->intr_count; i++) {
		if (enic_is_err_intr(enic, i) ||
		    enic_is_notify_intr(enic, i) ||
		    !cpumask_available(enic->msix[i].affinity_mask) ||
		    cpumask_empty(enic->msix[i].affinity_mask))
			continue;
		err = irq_update_affinity_hint(enic->msix_entry[i].vector,
					       enic->msix[i].affinity_mask);
		if (err)
			netdev_warn(enic->netdev, "irq_update_affinity_hint failed, err %d\n",
				    err);
	}

	for (i = 0; i < enic->wq_count; i++) {
		int wq_intr = enic_msix_wq_intr(enic, i);

		if (cpumask_available(enic->msix[wq_intr].affinity_mask) &&
		    !cpumask_empty(enic->msix[wq_intr].affinity_mask))
			netif_set_xps_queue(enic->netdev,
					    enic->msix[wq_intr].affinity_mask,
					    i);
	}
}
/* Clear the affinity hint of every vector (NULL mask) before the IRQs
 * are freed.  Safe to call for all vectors, including error/notify.
 */
static void enic_unset_affinity_hint(struct enic *enic)
{
	int i;

	for (i = 0; i < enic->intr_count; i++)
		irq_update_affinity_hint(enic->msix_entry[i].vector, NULL);
}
/* NOTE(review): orphaned fragment — the opening of the enclosing function
 * (apparently the tail of an ndo_features_check-style handler that strips
 * CSUM/GSO features for tunnels the HW cannot offload) was lost when this
 * file was chunked.  Code left byte-identical; comments only.
 */
/* Outer L3 protocol: only IPv4, or IPv6 when the adapter advertises
 * outer-IPv6 VXLAN support, can be offloaded.
 */
switch (vlan_get_protocol(skb)) { case htons(ETH_P_IPV6): if (!(enic->vxlan.flags & ENIC_VXLAN_OUTER_IPV6)) goto out;
proto = ipv6_hdr(skb)->nexthdr; break; case htons(ETH_P_IP):
proto = ip_hdr(skb)->protocol; break; default: goto out;
}
/* Inner ethernet protocol: likewise restricted to IPv4/IPv6. */
switch (eth->h_proto) { case ntohs(ETH_P_IPV6): if (!(enic->vxlan.flags & ENIC_VXLAN_INNER_IPV6)) goto out;
fallthrough; case ntohs(ETH_P_IP): break; default: goto out;
}
if (proto == IPPROTO_UDP) {
udph = udp_hdr(skb);
port = be16_to_cpu(udph->dest);
}
/* HW supports offload of only one UDP port. Remove CSUM and GSO MASK * for other UDP port tunnels
*/ if (port != enic->vxlan.vxlan_udp_port_number) goto out;
return features;
out: return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
}
/* Return non-zero when this vnic is a dynamic (Palo) interface, i.e. its
 * PCI device ID is the dynamic-enet one.
 */
int enic_is_dynamic(struct enic *enic)
{
	unsigned short device_id = enic->pdev->device;

	return device_id == PCI_DEVICE_ID_CISCO_VIC_ENET_DYN;
}
/* NOTE(review): fragments of at least two functions whose signatures were
 * lost in chunking — a WQ/RQ error-status logging loop (accumulating into
 * `err`) and a switch-port MTU change check.  Code left byte-identical.
 */
for (i = 0; i < enic->wq_count; i++) {
error_status = vnic_wq_error_status(&enic->wq[i].vwq);
err |= error_status; if (error_status)
netdev_err(enic->netdev, "WQ[%d] error_status %d\n",
i, error_status);
}
for (i = 0; i < enic->rq_count; i++) {
error_status = vnic_rq_error_status(&enic->rq[i].vrq);
err |= error_status; if (error_status)
netdev_err(enic->netdev, "RQ[%d] error_status %d\n",
i, error_status);
}
/* Port MTU changed: dynamic / SR-IOV VF vnics schedule deferred MTU
 * update work; other vnics only warn when the interface MTU exceeds
 * the new switch port MTU.
 */
if (mtu && mtu != enic->port_mtu) {
enic->port_mtu = mtu; if (enic_is_dynamic(enic) || enic_is_sriov_vf(enic)) {
mtu = max_t(int, ENIC_MIN_MTU,
min_t(int, ENIC_MAX_MTU, mtu)); if (mtu != netdev->mtu)
schedule_work(&enic->change_mtu_work);
} else { if (mtu < netdev->mtu)
netdev_warn(netdev, "interface MTU (%d) set higher " "than switch port MTU (%d)\n",
netdev->mtu, mtu);
}
}
}
staticvoid enic_set_rx_coal_setting(struct enic *enic)
{ unsignedint speed; int index = -1; struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting;
/* 1. Read the link speed from fw * 2. Pick the default range for the speed * 3. Update it in enic->rx_coalesce_setting
*/
speed = vnic_dev_port_speed(enic->vdev); if (speed > ENIC_LINK_SPEED_10G)
index = ENIC_LINK_40G_INDEX; elseif (speed > ENIC_LINK_SPEED_4G)
index = ENIC_LINK_10G_INDEX; else
index = ENIC_LINK_4G_INDEX;
/* Start with the value provided by UCSM */ for (index = 0; index < enic->rq_count; index++)
enic->cq[index].cur_rx_coal_timeval =
enic->config.intr_timer_usec;
rx_coal->use_adaptive_rx_coalesce = 1;
}
/* NOTE(review): heavily garbled region — the enic_link_check() body, an
 * MSI interrupt-handling comment, the tails of two WQ-queueing helpers,
 * the enic_preload_tcp_csum() header, and pieces of the hard_start_xmit
 * path are mashed together with their surrounding code missing.  All
 * code left byte-identical; comments only.
 */
staticvoid enic_link_check(struct enic *enic)
{ int link_status = vnic_dev_link_status(enic->vdev); int carrier_ok = netif_carrier_ok(enic->netdev);
/* With MSI, there is no sharing of interrupts, so this is * our interrupt and there is no need to ack it. The device * is not providing per-vector masking, so the OS will not * write to PCI config space to mask/unmask the interrupt. * We're using mask_on_assertion for MSI, so the device * automatically masks the interrupt when the interrupt is * generated. Later, when exiting polling, the interrupt * will be unmasked (see enic_poll). * * Also, the device uses the same PCIe Traffic Class (TC) * for Memory Write data and MSI, so there are no ordering * issues; the MSI will always arrive at the Root Complex * _after_ corresponding Memory Writes (i.e. descriptor * writes).
*/
/* Queue the main skb fragment. The fragments are no larger * than max MTU(9000)+ETH_HDR_LEN(14) bytes, which is less * than WQ_ENET_MAX_DESC_LEN length. So only one descriptor * per fragment is queued.
*/
enic_queue_wq_desc(wq, skb, dma_addr, head_len, vlan_tag_insert,
vlan_tag, eop, loopback);
if (!eop)
err = enic_queue_wq_skb_cont(enic, wq, skb, len_left, loopback);
/* The enic_queue_wq_desc() above does not do HW checksum */
enic->wq[wq->index].stats.csum_none++;
enic->wq[wq->index].stats.packets++;
enic->wq[wq->index].stats.bytes += skb->len;
/* Queue the main skb fragment. The fragments are no larger * than max MTU(9000)+ETH_HDR_LEN(14) bytes, which is less * than WQ_ENET_MAX_DESC_LEN length. So only one descriptor * per fragment is queued.
*/
enic_queue_wq_desc_csum_l4(wq, skb, dma_addr, head_len, csum_offset,
hdr_len, vlan_tag_insert, vlan_tag, eop,
loopback);
if (!eop)
err = enic_queue_wq_skb_cont(enic, wq, skb, len_left, loopback);
staticvoid enic_preload_tcp_csum(struct sk_buff *skb)
{ /* Preload TCP csum field with IP pseudo hdr calculated * with IP length set to zero. HW will later add in length * to each TCP segment resulting from the TSO.
*/
/* Below: hard_start_xmit fragment — drop zero-length skbs, then check
 * descriptor availability before queueing.
 */
if (skb->len <= 0) {
dev_kfree_skb_any(skb);
enic->wq[wq->index].stats.null_pkt++; return NETDEV_TX_OK;
}
txq = netdev_get_tx_queue(netdev, txq_map);
/* Non-TSO sends must fit within ENIC_NON_TSO_MAX_DESC descs, * which is very likely. In the off chance it's going to take * more than * ENIC_NON_TSO_MAX_DESC, linearize the skb.
*/
if (vnic_wq_desc_avail(wq) <
skb_shinfo(skb)->nr_frags + ENIC_DESC_MAX_SPLITS) {
netif_tx_stop_queue(txq); /* This is a hard error, log it */
netdev_err(netdev, "BUG! Tx ring full when queue awake!\n");
spin_unlock(&enic->wq[txq_map].lock);
enic->wq[wq->index].stats.desc_full_awake++; return NETDEV_TX_BUSY;
}
if (enic_queue_wq_skb(enic, wq, skb)) goto error;
if (vnic_wq_desc_avail(wq) < MAX_SKB_FRAGS + ENIC_DESC_MAX_SPLITS) {
netif_tx_stop_queue(txq);
enic->wq[wq->index].stats.stopped++;
}
skb_tx_timestamp(skb); if (!netdev_xmit_more() || netif_xmit_stopped(txq))
vnic_wq_doorbell(wq);
/* NOTE(review): fragments of a stats-dump helper, the VF port-profile
 * (set_vf_port) path, and the tail of an RX NAPI poll routine — their
 * enclosing signatures were lost in chunking.  Code left byte-identical.
 */
err = enic_dev_stats_dump(enic, &stats); /* return only when dma_alloc_coherent fails in vnic_dev_stats_dump * For other failures, like devcmd failure, we return previously * recorded stats.
*/ if (err == -ENOMEM) return;
if (vf == PORT_SELF_VF) { /* Special case handling: mac came from IFLA_VF_MAC */ if (!is_zero_ether_addr(prev_pp.vf_mac))
memcpy(pp->mac_addr, prev_pp.vf_mac, ETH_ALEN);
if (is_zero_ether_addr(netdev->dev_addr))
eth_hw_addr_random(netdev);
} else { /* SR-IOV VF: get mac from adapter */
ENIC_DEVCMD_PROXY_BY_INDEX(vf, err, enic,
vnic_dev_get_mac_addr, pp->mac_addr); if (err) {
netdev_err(netdev, "Error getting mac for vf %d\n", vf);
memcpy(pp, &prev_pp, sizeof(*pp)); return enic_dev_status_to_errno(err);
}
}
/* On failure either restore the previous port profile (implicit
 * DISASSOCIATE failed) or wipe it entirely.
 */
err = enic_process_set_pp_request(enic, vf, &prev_pp, &restore_pp); if (err) { if (restore_pp) { /* Things are still the way they were: Implicit * DISASSOCIATE failed
*/
memcpy(pp, &prev_pp, sizeof(*pp));
} else {
memset(pp, 0, sizeof(*pp)); if (vf == PORT_SELF_VF)
eth_hw_addr_set(netdev, zero_addr);
}
} else { /* Set flag to indicate that the port assoc/disassoc * request has been sent out to fw
*/
pp->set |= ENIC_PORT_REQUEST_APPLIED;
/* If DISASSOCIATE, clean up all assigned/saved macaddresses */ if (pp->request == PORT_REQUEST_DISASSOCIATE) {
eth_zero_addr(pp->mac_addr); if (vf == PORT_SELF_VF)
eth_hw_addr_set(netdev, zero_addr);
}
}
if (vf == PORT_SELF_VF)
eth_zero_addr(pp->vf_mac);
/* Below: RX NAPI poll tail (single-queue variant, uses rq[0]). */
/* Buffer allocation failed. Stay in polling * mode so we can try to fill the ring again.
*/
if (err)
rq_work_done = rq_work_to_do; if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) /* Call the function which refreshes the intr coalescing timer * value based on the traffic.
*/
enic_calc_int_moderation(enic, &enic->rq[0].vrq);
if ((rq_work_done < budget) && napi_complete_done(napi, rq_work_done)) {
/* Some work done, but not enough to stay in polling, * exit polling
*/
/* Build the RX CPU reverse map (ARFS) when running in MSI-X mode: one
 * entry per RQ vector.  On allocation failure the rmap is simply left
 * unset; on a partial add failure the whole rmap is torn down via
 * enic_free_rx_cpu_rmap().  No-op for INTx/MSI modes.
 */
static void enic_set_rx_cpu_rmap(struct enic *enic)
{
	int i, res;

	if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) {
		enic->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(enic->rq_count);
		if (unlikely(!enic->netdev->rx_cpu_rmap))
			return;
		for (i = 0; i < enic->rq_count; i++) {
			res = irq_cpu_rmap_add(enic->netdev->rx_cpu_rmap,
					       enic->msix_entry[i].vector);
			if (unlikely(res)) {
				enic_free_rx_cpu_rmap(enic);
				return;
			}
		}
	}
}
/* NOTE(review): fragments — the tail of the per-queue MSI-X RX poll
 * routine, the tail of an IRQ-free routine, and the enic_request_intr()
 * header mashed onto what looks like a synchronize-irqs body (the two
 * halves belong to different functions).  Code left byte-identical.
 */
/* Buffer allocation failed. Stay in polling mode * so we can try to fill the ring again.
*/
if (err)
work_done = work_to_do; if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce) /* Call the function which refreshes the intr coalescing timer * value based on the traffic.
*/
enic_calc_int_moderation(enic, &enic->rq[rq].vrq);
if ((work_done < budget) && napi_complete_done(napi, work_done)) {
/* Some work done, but not enough to stay in polling, * exit polling
*/
enic_free_rx_cpu_rmap(enic); switch (vnic_dev_get_intr_mode(enic->vdev)) { case VNIC_DEV_INTR_MODE_INTX:
free_irq(enic->pdev->irq, netdev); break; case VNIC_DEV_INTR_MODE_MSI:
free_irq(enic->pdev->irq, enic); break; case VNIC_DEV_INTR_MODE_MSIX: for (i = 0; i < enic->intr_count; i++) if (enic->msix[i].requested)
free_irq(enic->msix_entry[i].vector,
enic->msix[i].devid); break; default: break;
}
}
staticint enic_request_intr(struct enic *enic)
{ struct net_device *netdev = enic->netdev; unsignedint i, intr; int err = 0;
switch (vnic_dev_get_intr_mode(enic->vdev)) { case VNIC_DEV_INTR_MODE_INTX: case VNIC_DEV_INTR_MODE_MSI:
synchronize_irq(enic->pdev->irq); break; case VNIC_DEV_INTR_MODE_MSIX: for (i = 0; i < enic->intr_count; i++)
synchronize_irq(enic->msix_entry[i].vector); break; default: break;
}
}
/* Tell firmware which interrupt to use for notifications, based on the
 * current interrupt mode (legacy INTx vector, the dedicated MSI-X notify
 * vector, or -1 meaning "no interrupt" for MSI).  Serialized with other
 * devcmds via devcmd_lock.  Returns the devcmd status.
 */
static int enic_dev_notify_set(struct enic *enic)
{
	int err;

	spin_lock_bh(&enic->devcmd_lock);
	switch (vnic_dev_get_intr_mode(enic->vdev)) {
	case VNIC_DEV_INTR_MODE_INTX:
		err = vnic_dev_notify_set(enic->vdev, ENIC_LEGACY_NOTIFY_INTR);
		break;
	case VNIC_DEV_INTR_MODE_MSIX:
		err = vnic_dev_notify_set(enic->vdev,
					  enic_msix_notify_intr(enic));
		break;
	default:
		err = vnic_dev_notify_set(enic->vdev, -1 /* no intr */);
		break;
	}
	spin_unlock_bh(&enic->devcmd_lock);

	return err;
}
/* Kick the notify polling timer, but only in MSI mode — INTx and MSI-X
 * receive notifications via a dedicated interrupt instead.
 */
static void enic_notify_timer_start(struct enic *enic)
{
	switch (vnic_dev_get_intr_mode(enic->vdev)) {
	case VNIC_DEV_INTR_MODE_MSI:
		mod_timer(&enic->notify_timer, jiffies);
		break;
	default:
		/* Using intr for notification for INTx/MSI-X */
		break;
	}
}
/* NOTE(review): interleaved fragments of enic_open() (irq request, page
 * pool creation, RQ/WQ enable, napi enable) and enic_stop()/teardown
 * (napi disable, WQ/RQ disable and clean) — the enclosing signatures and
 * some error-path labels are not visible in this chunk.  Code left
 * byte-identical.
 */
err = enic_request_intr(enic); if (err) {
netdev_err(netdev, "Unable to request irq.\n"); return err;
}
enic_init_affinity_hint(enic);
enic_set_affinity_hint(enic);
err = enic_dev_notify_set(enic); if (err) {
netdev_err(netdev, "Failed to alloc notify buffer, aborting.\n"); goto err_out_free_intr;
}
for (i = 0; i < enic->rq_count; i++) { /* create a page pool for each RQ */
pp_params.napi = &enic->napi[i];
pp_params.queue_idx = i;
enic->rq[i].pool = page_pool_create(&pp_params); if (IS_ERR(enic->rq[i].pool)) {
err = PTR_ERR(enic->rq[i].pool);
enic->rq[i].pool = NULL; goto err_out_free_rq;
}
/* enable rq before updating rq desc */
vnic_rq_enable(&enic->rq[i].vrq);
vnic_rq_fill(&enic->rq[i].vrq, enic_rq_alloc_buf); /* Need at least one buffer on ring to get going */ if (vnic_rq_desc_used(&enic->rq[i].vrq) == 0) {
netdev_err(netdev, "Unable to alloc receive buffers\n");
err = -ENOMEM; goto err_out_free_rq;
}
}
for (i = 0; i < enic->wq_count; i++)
vnic_wq_enable(&enic->wq[i].vwq);
if (!enic_is_dynamic(enic) && !enic_is_sriov_vf(enic))
enic_dev_add_station_addr(enic);
enic_set_rx_mode(netdev);
netif_tx_wake_all_queues(netdev);
for (i = 0; i < enic->rq_count; i++)
napi_enable(&enic->napi[i]);
if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) for (i = 0; i < enic->wq_count; i++)
napi_enable(&enic->napi[enic_cq_wq(enic, i)]);
enic_dev_enable(enic);
for (i = 0; i < enic->intr_count; i++)
vnic_intr_unmask(&enic->intr[i]);
/* Below: teardown path (likely enic_stop) — quiesce napi, disable
 * queues, then clean rings and destroy page pools.
 */
for (i = 0; i < enic->rq_count; i++)
napi_disable(&enic->napi[i]);
netif_carrier_off(netdev); if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) for (i = 0; i < enic->wq_count; i++)
napi_disable(&enic->napi[enic_cq_wq(enic, i)]);
netif_tx_disable(netdev);
if (!enic_is_dynamic(enic) && !enic_is_sriov_vf(enic))
enic_dev_del_station_addr(enic);
for (i = 0; i < enic->wq_count; i++) {
err = vnic_wq_disable(&enic->wq[i].vwq); if (err) return err;
} for (i = 0; i < enic->rq_count; i++) {
err = vnic_rq_disable(&enic->rq[i].vrq); if (err) return err;
}
for (i = 0; i < enic->wq_count; i++)
vnic_wq_clean(&enic->wq[i].vwq, enic_free_wq_buf); for (i = 0; i < enic->rq_count; i++) {
vnic_rq_clean(&enic->rq[i].vrq, enic_free_rq_buf);
page_pool_destroy(enic->rq[i].pool);
enic->rq[i].pool = NULL;
} for (i = 0; i < enic->cq_count; i++)
vnic_cq_clean(&enic->cq[i]); for (i = 0; i < enic->intr_count; i++)
vnic_intr_clean(&enic->intr[i]);
return 0;
}
/* NOTE(review): fragments — _enic_change_mtu() truncated before its
 * return, a netpoll-controller body (the `#endif` suggests it sits under
 * CONFIG_NET_POLL_CONTROLLER), the enic_dev_wait() header, an INTx
 * resource comment, and napi teardown.  Code left byte-identical.
 */
staticint _enic_change_mtu(struct net_device *netdev, int new_mtu)
{ bool running = netif_running(netdev); int err = 0;
ASSERT_RTNL(); if (running) {
err = enic_stop(netdev); if (err) return err;
}
WRITE_ONCE(netdev->mtu, new_mtu);
if (running) {
err = enic_open(netdev); if (err) return err;
}
/* Below: poll-controller fragment — manually invoke the ISR for every
 * vector according to the current interrupt mode.
 */
switch (vnic_dev_get_intr_mode(vdev)) { case VNIC_DEV_INTR_MODE_MSIX: for (i = 0; i < enic->rq_count; i++) {
intr = enic_msix_rq_intr(enic, i);
enic_isr_msix(enic->msix_entry[intr].vector,
&enic->napi[i]);
}
for (i = 0; i < enic->wq_count; i++) {
intr = enic_msix_wq_intr(enic, i);
enic_isr_msix(enic->msix_entry[intr].vector,
&enic->napi[enic_cq_wq(enic, i)]);
}
break; case VNIC_DEV_INTR_MODE_MSI:
enic_isr_msi(enic->pdev->irq, enic); break; case VNIC_DEV_INTR_MODE_INTX:
enic_isr_legacy(enic->pdev->irq, netdev); break; default: break;
}
} #endif
staticint enic_dev_wait(struct vnic_dev *vdev, int (*start)(struct vnic_dev *, int), int (*finished)(struct vnic_dev *, int *), int arg)
{ unsignedlong time; int done; int err;
/* Next try INTx * * We need 3 INTRs * (the first INTR is used for WQ/RQ) * (the second INTR is used for WQ/RQ errors) * (the last INTR is used for notifications)
*/
for (i = 0; i < enic->rq_count; i++)
__netif_napi_del(&enic->napi[i]);
if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) for (i = 0; i < enic->wq_count; i++)
__netif_napi_del(&enic->napi[enic_cq_wq(enic, i)]);
/* observe RCU grace period after __netif_napi_del() calls */
synchronize_net();
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.