/* Open up the device */
ret = rndis_filter_open(nvdev); if (ret != 0) {
netdev_err(net, "unable to open device (ret %d).\n", ret); return ret;
}
rdev = nvdev->extension; if (!rdev->link_state) {
netif_carrier_on(net);
netvsc_tx_enable(nvdev, net);
}
if (vf_netdev) { /* Setting synthetic device up transparently sets * slave as up. If open fails, then slave will be * still be offline (and not used).
*/
ret = dev_open(vf_netdev, NULL); if (ret)
netdev_warn(net, "unable to open slave: %s: %d\n",
vf_netdev->name, ret);
} return 0;
}
/* Ensure pending bytes in ring are read */ for (;;) {
u32 aread = 0;
for (i = 0; i < nvdev->num_chn; i++) { struct vmbus_channel *chn
= nvdev->chan_table[i].channel;
if (!chn) continue;
/* make sure receive not running now */
napi_synchronize(&nvdev->chan_table[i].napi);
aread = hv_get_bytes_to_read(&chn->inbound); if (aread) break;
aread = hv_get_bytes_to_read(&chn->outbound); if (aread) break;
}
if (aread == 0) return 0;
if (++retry > RETRY_MAX) return -ETIMEDOUT;
usleep_range(RETRY_US_LO, RETRY_US_HI);
}
}
/* Stop transmit on all queues of the synthetic device.
 *
 * Set nvscdev->tx_disable before stopping the queues so that the
 * wake-queue path observes the flag and does not restart them; the
 * write barrier orders the flag update against the queue stop.
 */
static void netvsc_tx_disable(struct netvsc_device *nvscdev,
			      struct net_device *ndev)
{
	if (nvscdev) {
		nvscdev->tx_disable = true;
		virt_wmb(); /* ensure txq will not wake up after stop */
	}

	netif_tx_disable(ndev);
}
/* If queue index changed record the new value */ if (q_idx != old_idx &&
sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, q_idx);
return q_idx;
}
/*
 * Select queue for transmit.
 *
 * If a valid queue has already been assigned, then use that.
 * Otherwise compute tx queue based on hash and the send table.
 *
 * This is basically similar to default (netdev_pick_tx) with the added step
 * of using the host send_table when no other queue has been assigned.
 *
 * TODO support XPS - but get_xps_queue not exported
 */
static u16 netvsc_pick_tx(struct net_device *ndev, struct sk_buff *skb)
{
	int q_idx = sk_tx_queue_get(skb->sk);

	if (q_idx < 0 || skb->ooo_okay || q_idx >= ndev->real_num_tx_queues) {
		/* If forwarding a packet, we use the recorded queue when
		 * available for better cache locality.
		 */
		if (skb_rx_queue_recorded(skb))
			q_idx = skb_get_rx_queue(skb);
		else
			q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
	}

	return q_idx;
}
/* Record the queue selected by VF so that it can be * used for common case where VF has more queues than * the synthetic device.
*/
qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
} else {
txq = netvsc_pick_tx(ndev, skb);
}
rcu_read_unlock();
while (txq >= ndev->real_num_tx_queues)
txq -= ndev->real_num_tx_queues;
/* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. * If netpoll is in uses, then VF can not be used either.
*/
vf_netdev = rcu_dereference_bh(net_device_ctx->vf_netdev); if (vf_netdev && netif_running(vf_netdev) &&
netif_carrier_ok(vf_netdev) && !netpoll_tx_running(net) &&
net_device_ctx->data_path_is_vf) return netvsc_vf_xmit(net, vf_netdev, skb);
/* We will atmost need two pages to describe the rndis * header. We can only transmit MAX_PAGE_BUFFER_COUNT number * of pages in a single packet. If skb is scattered around * more pages we try linearizing it.
*/
num_data_pgs = netvsc_get_slots(skb) + 2;
if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
++net_device_ctx->eth_stats.tx_scattered;
/* * Place the rndis header in the skb head room and * the skb->cb will be used for hv_netvsc_packet * structure.
*/
ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE); if (ret) goto no_memory;
/* Use the skb control buffer for building up the packet */
BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
sizeof_field(struct sk_buff, cb));
packet = (struct hv_netvsc_packet *)skb->cb;
/* When using AF_PACKET we need to drop VLAN header from * the frame and update the SKB to allow the HOST OS * to transmit the 802.1Q packet
*/ if (skb->protocol == htons(ETH_P_8021Q)) {
u16 vlan_tci;
skb_reset_mac_header(skb); if (eth_type_vlan(eth_hdr(skb)->h_proto)) { if (unlikely(__skb_vlan_pop(skb, &vlan_tci) != 0)) {
++net_device_ctx->eth_stats.vlan_error; goto drop;
}
if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
csum_info->transmit.tcp_checksum = 1; else
csum_info->transmit.udp_checksum = 1;
}
} else { /* Can't do offload of this type of checksum */ if (skb_checksum_help(skb)) goto drop;
}
}
/* Start filling in the page buffers with the rndis hdr */
rndis_msg->msg_len += rndis_msg_size;
packet->total_data_buflen = rndis_msg->msg_len;
packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
skb, packet, pb);
/* timestamp packet in software */
skb_tx_timestamp(skb);
ret = netvsc_send(net, packet, rndis_msg, pb, skb, xdp_tx); if (likely(ret == 0)) return NETDEV_TX_OK;
if (ret == -EAGAIN) {
++net_device_ctx->eth_stats.tx_busy; return NETDEV_TX_BUSY;
}
if (ret == -ENOSPC)
++net_device_ctx->eth_stats.tx_no_space;
/* Ensure the packet is big enough to access its fields */ if (resp->msg_len - RNDIS_HEADER_SIZE < sizeof(struct rndis_indicate_status)) {
netdev_err(net, "invalid rndis_indicate_status packet, len: %u\n",
resp->msg_len); return;
}
/* Copy the RNDIS indicate status into nvchan->recv_buf */
memcpy(indicate, data + RNDIS_HEADER_SIZE, sizeof(*indicate));
/* Update the physical link speed when changing to another vSwitch */ if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
u32 speed;
/* Validate status_buf_offset and status_buflen. * * Certain (pre-Fe) implementations of Hyper-V's vSwitch didn't account * for the status buffer field in resp->msg_len; perform the validation * using data_buflen (>= resp->msg_len).
*/ if (indicate->status_buflen < sizeof(speed) ||
indicate->status_buf_offset < sizeof(*indicate) ||
data_buflen - RNDIS_HEADER_SIZE < indicate->status_buf_offset ||
data_buflen - RNDIS_HEADER_SIZE - indicate->status_buf_offset
< indicate->status_buflen) {
netdev_err(net, "invalid rndis_indicate_status packet\n"); return;
}
/* Copy to skb. This copy is needed here since the memory * pointed by hv_netvsc_packet cannot be deallocated.
*/ for (i = 0; i < nvchan->rsc.cnt; i++)
skb_put_data(skb, nvchan->rsc.data[i],
nvchan->rsc.len[i]);
}
skb->protocol = eth_type_trans(skb, net);
/* skb is already created with CHECKSUM_NONE */
skb_checksum_none_assert(skb);
/* Incoming packets may have IP header checksum verified by the host. * They may not have IP header checksum computed after coalescing. * We compute it here if the flags are set, because on Linux, the IP * checksum is always checked.
*/ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && csum_info->receive.ip_checksum_value_invalid &&
csum_info->receive.ip_checksum_succeeded &&
skb->protocol == htons(ETH_P_IP)) { /* Check that there is enough space to hold the IP header. */ if (skb_headlen(skb) < sizeof(struct iphdr)) {
kfree_skb(skb); return NULL;
}
netvsc_comp_ipcsum(skb);
}
/* Do L4 checksum offload if enabled and present. */ if ((ppi_flags & NVSC_RSC_CSUM_INFO) && (net->features & NETIF_F_RXCSUM)) { if (csum_info->receive.tcp_checksum_succeeded ||
csum_info->receive.udp_checksum_succeeded)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
/* Allocate a skb - TODO direct I/O to pages? */
skb = netvsc_alloc_recv_skb(net, nvchan, &xdp);
if (unlikely(!skb)) {
++net_device_ctx->eth_stats.rx_no_memory; return NVSP_STAT_FAIL;
}
skb_record_rx_queue(skb, q_idx);
/* * Even if injecting the packet, record the statistics * on the synthetic device because modifying the VF device * statistics will not work correctly.
*/
u64_stats_update_begin(&rx_stats->syncp); if (act == XDP_TX)
rx_stats->xdp_tx++;
/* We do not support separate count for rx, tx, or other */ if (count == 0 ||
channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL;
if (!nvdev || nvdev->destroy) return -ENODEV;
if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5) return -EINVAL;
if (count > nvdev->max_chn) return -EINVAL;
orig = nvdev->num_chn;
device_info = netvsc_devinfo_get(nvdev);
if (!device_info) return -ENOMEM;
device_info->num_chn = count;
ret = netvsc_detach(net, nvdev); if (ret) goto out;
ret = netvsc_attach(net, device_info); if (ret) {
device_info->num_chn = orig; if (netvsc_attach(net, device_info))
netdev_err(net, "restoring channel setting failed\n");
}
/* Hyper-V RNDIS protocol does not have ring in the HW sense. * It does have pre-allocated receive area which is divided into sections.
*/ staticvoid __netvsc_get_ringparam(struct netvsc_device *nvdev, struct ethtool_ringparam *ring)
{
u32 max_buf_size;
switch (event->event) { /* Only the following events are possible due to the check in * netvsc_linkstatus_callback()
*/ case RNDIS_STATUS_MEDIA_CONNECT: if (rdev->link_state) {
rdev->link_state = false;
netif_carrier_on(net);
netvsc_tx_enable(net_device, net);
} else {
__netdev_notify_peers(net);
}
kfree(event); break; case RNDIS_STATUS_MEDIA_DISCONNECT: if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netvsc_tx_disable(net_device, net);
}
kfree(event); break; case RNDIS_STATUS_NETWORK_CHANGE: /* Only makes sense if carrier is present */ if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netvsc_tx_disable(net_device, net);
event->event = RNDIS_STATUS_MEDIA_CONNECT;
spin_lock_irqsave(&ndev_ctx->lock, flags);
list_add(&event->list, &ndev_ctx->reconfig_events);
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
reschedule = true;
} break;
}
rtnl_unlock();
/* link_watch only sends one notification with current state per * second, handle next reconfig event in 2 seconds.
*/ if (reschedule)
schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
/* Attach a VF network device as the transparent slave of the synthetic
 * device: register the receive handler, link the VF as an upper device
 * and, when invoked from the notifier path, schedule the delayed takeover.
 *
 * Returns 0 on success or a negative errno; partially registered state
 * is unwound on failure.
 */
static int netvsc_vf_join(struct net_device *vf_netdev,
			  struct net_device *ndev, int context)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	int ret;

	ret = netdev_rx_handler_register(vf_netdev,
					 netvsc_vf_handle_frame, ndev);
	if (ret != 0) {
		netdev_err(vf_netdev, "can not register netvsc VF receive handler (err = %d)\n",
			   ret);
		goto rx_handler_failed;
	}

	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
					   NULL, NULL, NULL);
	if (ret != 0) {
		netdev_err(vf_netdev, "can not set master device %s (err = %d)\n",
			   ndev->name, ret);
		goto upper_link_failed;
	}

	/* If this registration is called from probe context vf_takeover
	 * is taken care of later in probe itself.
	 */
	if (context == VF_REG_IN_NOTIFIER)
		schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);

	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);

	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
	return 0;

upper_link_failed:
	netdev_rx_handler_unregister(vf_netdev);
rx_handler_failed:
	return ret;
}
/* Bring the VF in line with the synthetic master device: align MTU,
 * mirror interface flags and the unicast/multicast address lists, and
 * open the VF if the master is already running.  MTU/open failures are
 * logged but not fatal.
 */
static void __netvsc_vf_setup(struct net_device *ndev,
			      struct net_device *vf_netdev)
{
	int ret;

	/* Align MTU of VF with master */
	ret = dev_set_mtu(vf_netdev, ndev->mtu);
	if (ret)
		netdev_warn(vf_netdev, "unable to change mtu to %u\n",
			    ndev->mtu);

	/* set multicast etc flags on VF */
	dev_change_flags(vf_netdev, ndev->flags | IFF_SLAVE, NULL);

	/* sync address list from ndev to VF */
	netif_addr_lock_bh(ndev);
	dev_uc_sync(vf_netdev, ndev);
	dev_mc_sync(vf_netdev, ndev);
	netif_addr_unlock_bh(ndev);

	if (netif_running(ndev)) {
		ret = dev_open(vf_netdev, NULL);
		if (ret)
			netdev_warn(vf_netdev, "unable to open: %d\n", ret);
	}
}
/* Setup VF as slave of the synthetic device.
 * Runs in workqueue to avoid recursion in netlink callbacks.
 * If the RTNL lock cannot be taken immediately, the work is rescheduled
 * rather than blocking the workqueue.
 */
static void netvsc_vf_setup(struct work_struct *w)
{
	struct net_device_context *ndev_ctx
		= container_of(w, struct net_device_context, vf_takeover.work);
	struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx);
	struct net_device *vf_netdev;

	if (!rtnl_trylock()) {
		schedule_delayed_work(&ndev_ctx->vf_takeover, 0);
		return;
	}

	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
	if (vf_netdev)
		__netvsc_vf_setup(ndev, vf_netdev);

	rtnl_unlock();
}
/* Find netvsc by VF serial number. * The PCI hyperv controller records the serial number as the slot kobj name.
*/ staticstruct net_device *get_netvsc_byslot(conststruct net_device *vf_netdev)
{ struct device *parent = vf_netdev->dev.parent; struct net_device_context *ndev_ctx; struct net_device *ndev; struct pci_dev *pdev;
u32 serial;
if (!parent || !dev_is_pci(parent)) return NULL; /* not a PCI device */
pdev = to_pci_dev(parent); if (!pdev->slot) {
netdev_notice(vf_netdev, "no PCI slot information\n"); return NULL;
}
/* Fallback path to check synthetic vf with help of mac addr. * Because this function can be called before vf_netdev is * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied * from dev_addr, also try to match to its dev_addr. * Note: On Hyper-V and Azure, it's not possible to set a MAC address * on a VF that matches to the MAC of a unrelated NETVSC device.
*/
list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
ndev = hv_get_drvdata(ndev_ctx->device_ctx); if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) ||
ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) return ndev;
}
netdev_notice(vf_netdev, "no netdev found for vf serial:%u\n", serial); return NULL;
}
ndev = get_netvsc_byslot(vf_netdev); if (!ndev) return NOTIFY_DONE;
/* Set slave flag and no addrconf flag before open * to prevent IPv6 addrconf.
*/
vf_netdev->flags |= IFF_SLAVE;
vf_netdev->priv_flags |= IFF_NO_ADDRCONF; return NOTIFY_DONE;
}
/* if synthetic interface is a different namespace, * then move the VF to that namespace; join will be * done again in that context.
*/ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
ret = dev_change_net_namespace(vf_netdev,
dev_net(ndev), "eth%d"); if (ret)
netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n",
ndev->name, ret); else
netdev_info(vf_netdev, "VF moved to namespace with: %s\n",
ndev->name); return NOTIFY_DONE;
}
/* Change the data path when VF UP/DOWN/CHANGE are detected. * * Typically a UP or DOWN event is followed by a CHANGE event, so * net_device_ctx->data_path_is_vf is used to cache the current data path * to avoid the duplicate call of netvsc_switch_datapath() and the duplicate * message. * * During hibernation, if a VF NIC driver (e.g. mlx5) preserves the network * interface, there is only the CHANGE event and no UP or DOWN event.
*/ staticint netvsc_vf_changed(struct net_device *vf_netdev, unsignedlong event)
{ struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; struct net_device *ndev; bool vf_is_up = false; int ret;
if (event != NETDEV_GOING_DOWN)
vf_is_up = netif_running(vf_netdev);
ndev = get_netvsc_byref(vf_netdev); if (!ndev) return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); if (!netvsc_dev) return NOTIFY_DONE;
if (net_device_ctx->data_path_is_vf == vf_is_up) return NOTIFY_OK;
if (vf_is_up && !net_device_ctx->vf_alloc) {
netdev_info(ndev, "Waiting for the VF association from host\n");
wait_for_completion(&net_device_ctx->vf_add);
}
/* In Azure, when accelerated networking in enabled, other NICs * like MANA, MLX, are configured as a bonded nic with * Netvsc(failover) NIC. For bonded NICs, the min of the max * pkt aggregate size of the members is propagated in the stack. * In order to allow these NICs (MANA/MLX) to use up to * GSO_MAX_SIZE gso packet size, we need to allow Netvsc NIC to * also support this in the guest. * This value is only increased for netvsc NIC when datapath is * switched over to the VF
*/ if (vf_is_up)
netif_set_tso_max_size(ndev, vf_netdev->tso_max_size); else
netif_set_tso_max_size(ndev, netvsc_dev->netvsc_gso_max_size);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.