/* * We can't use an array for xlt_emergency_page because dma_map_single doesn't * work on kernel modules memory
*/ void *xlt_emergency_page; static DEFINE_MUTEX(xlt_emergency_page_mutex);
result = MLX5_MKEY_MASK_LEN |
MLX5_MKEY_MASK_PAGE_SIZE |
MLX5_MKEY_MASK_START_ADDR; if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
/*
 * Release the UMR resources (lock, QP and CQ) associated with @dev.
 * If the UMR machinery was never brought up there is nothing to free.
 */
void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
{
	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
		return;

	mutex_destroy(&dev->umrc.lock);
	/*
	 * After device init the UMR cq/qp remain set for the device's
	 * lifetime, so past the state check they are always valid here.
	 * The QP is destroyed first, then its CQ (order as in the original).
	 */
	ib_destroy_qp(dev->umrc.qp);
	ib_free_cq(dev->umrc.cq);
}
/*
 * Set up the UMR prerequisites for @dev: allocate the protection domain
 * used by the UMR QP and initialize the lock guarding later UMR setup.
 *
 * Return: 0 on success, a negative errno if PD allocation fails.
 */
int mlx5r_umr_init(struct mlx5_ib_dev *dev)
{
	struct ib_pd *pd = ib_alloc_pd(&dev->ib_dev, 0);

	if (IS_ERR(pd)) {
		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
		return PTR_ERR(pd);
	}

	dev->umrc.pd = pd;
	mutex_init(&dev->umrc.init_lock);

	return 0;
}
void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
{ if (!dev->umrc.pd) return;
mutex_lock(&umrc->lock); /* Preventing any further WRs to be sent now */ if (umrc->state != MLX5_UMR_STATE_RECOVER) {
mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
umrc->state);
umrc->state = MLX5_UMR_STATE_RECOVER;
}
mutex_unlock(&umrc->lock);
/* Sending a final/barrier WR (the failed one) and wait for its completion. * This will ensure that all the previous WRs got a completion before * we set the QP state to RESET.
*/
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
with_data); if (err) {
mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); goto err;
}
/* Since the QP is in an error state, it will only receive * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier * we don't care about its status.
*/
wait_for_completion(&umr_context->done);
attr.qp_state = IB_QPS_RESET;
err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); if (err) {
mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); goto err;
}
err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); if (err) {
mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); goto err;
}
if (umr_context.status == IB_WC_WR_FLUSH_ERR) continue;
WARN_ON_ONCE(1);
mlx5_ib_warn(dev, "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
umr_context.status, mkey);
err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); if (err)
mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
err);
err = -EFAULT; break;
}
up(&umrc->sem); return err;
}
/** * mlx5r_umr_revoke_mr - Fence all DMA on the MR * @mr: The MR to fence * * Upon return the NIC will not be doing any DMA to the pages under the MR, * and any DMA in progress will be completed. Failure of this function * indicates the HW has failed catastrophically.
*/ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
{ struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct mlx5r_umr_wqe wqe = {};
if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0;
/* * Allocate a temporary buffer to hold the per-page information to transfer to * HW. For efficiency this should be as large as it can be, but buffer * allocation failure is not allowed, so try smaller sizes.
*/ staticvoid *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{ const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
size_t size; void *res = NULL;
/* * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the * allocation can't trigger any kind of reclaim.
*/
might_sleep();
gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
/* * If the system already has a suitable high order page then just use * that, but don't try hard to create one. This max is about 1M, so a * free x86 huge page will satisfy it.
*/
size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
MLX5_MAX_UMR_CHUNK);
*nents = size / ent_size;
res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
get_order(size)); if (res) return res;
if (size > MLX5_SPARE_UMR_CHUNK) {
size = MLX5_SPARE_UMR_CHUNK;
*nents = size / ent_size;
res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
get_order(size)); if (res) return res;
}
*nents = PAGE_SIZE / ent_size;
res = (void *)__get_free_page(gfp_mask); if (res) return res;
mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift); if (dd) /* Use the data direct internal kernel PD */
MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
}
/*
 * Send the DMA list to the HW for a normal MR using UMR.
 * Dmabuf MRs take the same path, except that the MLX5_IB_UPD_XLT_ZAP
 * flag may additionally be used by the caller.
 */
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
	/*
	 * Delegates to the ranged variant with start/size of 0 —
	 * presumably meaning "the whole MR"; confirm against
	 * mlx5r_umr_update_mr_pas_range().
	 */
	return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
}
if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
!umr_can_use_indirect_mkey(dev)) return -EPERM;
if (WARN_ON(!mr->umem->is_odp)) return -EINVAL;
/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes, * so we need to align the offset and length accordingly
*/ if (idx & page_mask) {
npages += idx & page_mask;
idx &= ~page_mask;
}
pages_to_map = ALIGN(npages, page_align);
for (pages_mapped = 0;
pages_mapped < pages_to_map && !err;
pages_mapped += pages_iter, idx += pages_iter) {
npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
DMA_TO_DEVICE); /* * npages is the maximum number of pages to map, but we * can't guarantee that all pages are actually mapped. * * For example, if page is p2p of type which is not supported * for mapping, the number of pages mapped will be less than * requested.
*/
err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); if (err) return err;
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
/* * Update only the page-size (log_page_size) field of an existing memory key * using UMR. This is useful when the MR's physical layout stays the same * but the optimal page shift has changed (e.g. dmabuf after pages are * pinned and the HW can switch from 4K to huge-page alignment).
*/ int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr, unsignedint page_shift, bool dd)
{ struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct mlx5r_umr_wqe wqe = {}; int err;
/* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */
wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);
/* MR must be free while page size is modified */
wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;
/* Fill mkey segment with the new page size, keep the rest unchanged */
MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);
/**
 * _mlx5r_umr_zap_mkey - make an mkey completely non-present
 * @mr: the memory region to operate on
 * @flags: XLT update flags; ZAP, ATOMIC and KEEP_PGSZ are added internally
 * @page_shift: the new (target) page shift for the MR
 * @nblocks: out parameter — number of entries that were zapped
 * @dd: true for data-direct (KSM access mode), false for MTT
 *
 * Zero out the first N translation entries of the mkey, where N is derived
 * from the largest page size supported by the device and the MR length,
 * then raise the mkey's page size to that maximum so the MR is completely
 * non-present and safe for further updates. Useful when updating the page
 * size of a dmabuf MR on a page fault.
 *
 * Return: 0 on success, with the zapped-entry count in @nblocks; a
 * negative error code on failure (the MR's page shift is restored).
 */
static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr, unsigned int flags,
			       unsigned int page_shift, size_t *nblocks,
			       bool dd)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	unsigned int saved_page_shift = mr->page_shift;
	unsigned int zap_page_shift;
	unsigned int log_size_cap;
	size_t target_nblocks;
	int access_mode;
	int ret;

	access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
	flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
		 MLX5_IB_UPD_XLT_ATOMIC;

	log_size_cap = get_max_log_entity_size_cap(dev, access_mode);
	zap_page_shift = order_base_2(mr->ibmr.length);
	zap_page_shift = min(max(zap_page_shift, page_shift), log_size_cap);

	/*
	 * Count blocks in units of zap_page_shift; zapping exactly this many
	 * makes the whole MR non-present. The count must be aligned to the
	 * UMR entry alignment (KSM or MTT) since it may later be used as an
	 * offset into the XLT.
	 */
	*nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << zap_page_shift);
	*nblocks = ALIGN(*nblocks, dd ? MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT :
					MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);

	target_nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << page_shift);
	/*
	 * If zapping at the max possible page shift would cover at least as
	 * many blocks as exist at the new page size, just go over the whole
	 * mkey entries instead — signalled by *nblocks == 0.
	 */
	if (*nblocks >= target_nblocks)
		*nblocks = 0;

	/*
	 * Make the first *nblocks entries non-present without changing the
	 * page size yet.
	 */
	if (*nblocks)
		mr->page_shift = zap_page_shift;
	ret = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
	if (ret)
		goto restore;

	/*
	 * Now that the MR is completely non-present, switch its page size
	 * to the maximum.
	 */
	if (*nblocks) {
		ret = mlx5r_umr_update_mr_page_shift(mr, zap_page_shift, dd);
		if (ret)
			goto restore;
	}

	return 0;

restore:
	mr->page_shift = saved_page_shift;
	return ret;
}
/** * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its * entries accordingly * @mr: The memory region to update * @xlt_flags: Translation table update flags * @page_shift: The new (optimized) page shift to use * * This function updates the page size and mkey translation entries for a DMABUF * MR in a safe, multi-step process to avoid exposing partially updated mappings * The update is performed in 5 steps: * 1. Make the first X entries non-present, while X is calculated to be * minimal according to a large page shift that can be used to cover the * MR length. * 2. Update the page size to the large supported page size * 3. Load the remaining N-X entries according to the (optimized) page_shift * 4. Update the page size according to the (optimized) page_shift * 5. Load the first X entries with the correct translations * * This ensures that at no point is the MR accessible with a partially updated * translation table, maintaining correctness and preventing access to stale or * inconsistent mappings. * * Returns 0 on success or a negative error code on failure.
*/ int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags, unsignedint page_shift)
{ unsignedint old_page_shift = mr->page_shift;
size_t zapped_blocks;
size_t total_blocks; int err;
/*
 * NOTE(review): the following disclaimer appears to be unrelated web-page
 * residue (German) accidentally appended to this source file; preserved
 * here in English translation pending confirmation that it can be removed:
 * "The information on this web page has been compiled carefully and to the
 * best of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the provided information is guaranteed.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */