/*
 * Bookkeeping attached to one process element (PE).
 *
 * Instances are stored in the SPA's pe_tree radix tree, keyed by PE
 * handle, and released with kfree_rcu() through the 'rcu' member.
 */
struct pe_data {
	/* memory context of the PE; NULL is treated as a kernel context */
	struct mm_struct *mm;

	/* callback to trigger when a translation fault occurs */
	void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);

	/* opaque pointer to be passed to the above callback */
	void *xsl_err_data;

	/* deferred-free handle handed to kfree_rcu() */
	struct rcu_head rcu;

	/* link this PE belongs to */
	struct ocxl_link *link;

	/* registered against 'mm' when the link uses MMIO TLB invalidates */
	struct mmu_notifier mmu_notifier;
};
/*
 * Shared Process Area state for one link: the process element table,
 * the lock and lookup tree protecting it, and everything needed to
 * service the translation fault interrupt.
 */
struct spa {
	struct ocxl_process_element *spa_mem;
	int spa_order;
	struct mutex spa_lock;
	struct radix_tree_root pe_tree;	/* Maps PE handles to pe_data */

	/* translation fault interrupt: name and Linux irq number */
	char *irq_name;
	int virq;

	/* registers used by the translation fault interrupt handler */
	void __iomem *reg_dsisr;
	void __iomem *reg_dar;
	void __iomem *reg_tfc;
	void __iomem *reg_pe_handle;

	/*
	 * The following fields are used by the memory fault
	 * interrupt handler. We can only have one interrupt at a
	 * time. The NPU won't raise another interrupt until the
	 * previous one has been ack'd by writing to the TFC register
	 */
	struct xsl_fault {
		struct work_struct fault_work;
		u64 pe;
		u64 dsisr;
		u64 dar;
		/* snapshot copied from the radix tree entry by the handler */
		struct pe_data pe_data;
	} xsl_fault;
};
/*
 * An opencapi link can be used by several PCI functions. We have
 * one link per device slot.
 *
 * A linked list of opencapi links should suffice, as there's a
 * limited number of opencapi slots on a system and lookup is only
 * done when the device is probed
 */
struct ocxl_link {
	struct list_head list;
	struct kref ref;

	/* identification of the link, also used to name the xsl interrupt */
	int domain;
	int bus;
	int dev;

	void __iomem *arva;	/* ATSD register virtual address */
	spinlock_t atsd_lock;	/* to serialize shootdowns */

	/* number of interrupts that can still be allocated on this link */
	atomic_t irq_available;

	struct spa *spa;
	void *platform_data;
};

/* All known links, and the mutex declared alongside the list */
static LIST_HEAD(links_list);
static DEFINE_MUTEX(links_list_lock);
/* * We must release a reference on mm_users whenever exiting this * function (taken in the memory fault interrupt handler)
*/
rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
&flt); if (rc) {
pr_debug("copro_handle_mm_fault failed: %d\n", rc); if (fault->pe_data.xsl_err_cb) {
fault->pe_data.xsl_err_cb(
fault->pe_data.xsl_err_data,
fault->dar, fault->dsisr);
}
r = ADDRESS_ERROR; goto ack;
}
if (!radix_enabled()) { /* * update_mmu_cache() will not have loaded the hash * since current->trap is not a 0x400 or 0x300, so * just call hash_page_mm() here.
*/
access = _PAGE_PRESENT | _PAGE_READ; if (fault->dsisr & SPA_XSL_S)
access |= _PAGE_WRITE;
if (get_region_id(fault->dar) != USER_REGION_ID)
access |= _PAGE_PRIVILEGED;
WARN_ON(pe_handle > SPA_PE_MASK);
pe = spa->spa_mem + pe_handle;
pid = be32_to_cpu(pe->pid); /* We could be reading all null values here if the PE is being * removed while an interrupt kicks in. It's not supposed to * happen if the driver notified the AFU to terminate the * PASID, and the AFU waited for pending operations before * acknowledging. But even if it happens, we won't find a * memory context below and fail silently, so it should be ok.
*/ if (!(dsisr & SPA_XSL_TF)) {
WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
ack_irq(spa, ADDRESS_ERROR); return IRQ_HANDLED;
}
rcu_read_lock();
pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle); if (!pe_data) { /* * Could only happen if the driver didn't notify the * AFU about PASID termination before removing the PE, * or the AFU didn't wait for all memory access to * have completed. * * Either way, we fail early, but we shouldn't log an * error message, as it is a valid (if unexpected) * scenario
*/
rcu_read_unlock();
pr_debug("Unknown mm context for xsl interrupt\n");
ack_irq(spa, ADDRESS_ERROR); return IRQ_HANDLED;
}
if (!pe_data->mm) { /* * translation fault from a kernel context - an OpenCAPI * device tried to access a bad kernel address
*/
rcu_read_unlock();
pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
ack_irq(spa, ADDRESS_ERROR); return IRQ_HANDLED;
}
WARN_ON(pe_data->mm->context.id != pid);
if (mmget_not_zero(pe_data->mm)) {
spa->xsl_fault.pe = pe_handle;
spa->xsl_fault.dar = dar;
spa->xsl_fault.dsisr = dsisr;
spa->xsl_fault.pe_data = *pe_data;
schedule = true; /* mm_users count released by bottom half */
}
rcu_read_unlock(); if (schedule)
schedule_work(&spa->xsl_fault.fault_work); else
ack_irq(spa, ADDRESS_ERROR); return IRQ_HANDLED;
}
staticint setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link)
{ struct spa *spa = link->spa; int rc; int hwirq;
rc = pnv_ocxl_get_xsl_irq(dev, &hwirq); if (rc) return rc;
rc = map_irq_registers(dev, spa); if (rc) return rc;
spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
link->domain, link->bus, link->dev); if (!spa->irq_name) {
dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
rc = -ENOMEM; goto err_xsl;
} /* * At some point, we'll need to look into allowing a higher * number of interrupts. Could we have an IRQ domain per link?
*/
spa->virq = irq_create_mapping(NULL, hwirq); if (!spa->virq) {
dev_err(&dev->dev, "irq_create_mapping failed for translation interrupt\n");
rc = -EINVAL; goto err_name;
}
dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
rc = setup_xsl_irq(dev, link); if (rc) goto err_spa;
/* platform specific hook */
rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
&link->platform_data); if (rc) goto err_xsl_irq;
/* if link->arva is not defined, MMIO registers are not used to * generate TLB invalidate. PowerBus snooping is enabled. * Otherwise, PowerBus snooping is disabled. TLB Invalidates are * initiated using MMIO registers.
*/
pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);
state = SPA_CFG_DR; if (mfspr(SPRN_LPCR) & LPCR_TC)
state |= SPA_CFG_TC; if (radix_enabled())
state |= SPA_CFG_XLAT_ror; else
state |= SPA_CFG_XLAT_hpt;
state |= SPA_CFG_HV; if (kernel) { if (mfmsr() & MSR_SF)
state |= SPA_CFG_SF;
} else {
state |= SPA_CFG_PR; if (!test_tsk_thread_flag(current, TIF_32BIT))
state |= SPA_CFG_SF;
} return state;
}
/* * For user contexts, register a copro so that TLBIs are seen * by the nest MMU. If we have a kernel context, TLBIs are * already global.
*/ if (mm) {
mm_context_add_copro(mm); if (link->arva) { /* Use MMIO registers for the TLB Invalidate * operations.
*/
trace_ocxl_init_mmu_notifier(pasid, mm->context.id);
mmu_notifier_register(&pe_data->mmu_notifier, mm);
}
}
/* * Barrier is to make sure PE is visible in the SPA before it * is used by the device. It also helps with the global TLBI * invalidation
*/
mb();
radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
/* * The mm must stay valid for as long as the device uses it. We * lower the count when the context is removed from the SPA. * * We grab mm_count (and not mm_users), as we don't want to * end up in a circular dependency if a process mmaps its * mmio, therefore incrementing the file ref count when * calling mmap(), and forgets to unmap before exiting. In * that scenario, when the kernel handles the death of the * process, the file is not cleaned because unmap was not * called, and the mm wouldn't be freed because we would still * have a reference on mm_users. Incrementing mm_count solves * the problem.
*/ if (mm)
mmgrab(mm);
trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
unlock:
mutex_unlock(&spa->spa_lock); return rc;
}
EXPORT_SYMBOL_GPL(ocxl_link_add_pe);
/*
 * Store a new thread id in the process element associated with a PASID
 * and evict that element from the NPU context cache.
 *
 * @link_handle: the link, as an opaque pointer to a struct ocxl_link
 * @pasid: PASID whose process element is updated
 * @tid: thread id to write into the process element
 *
 * Returns -EINVAL when @pasid exceeds SPA_PASID_MAX, otherwise the
 * value returned by pnv_ocxl_spa_remove_pe_from_cache().
 */
int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid)
{
	struct ocxl_link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *elem;
	int handle;
	int ret;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	handle = pasid & SPA_PE_MASK;
	elem = &spa->spa_mem[handle];

	mutex_lock(&spa->spa_lock);

	elem->tid = cpu_to_be32(tid);

	/*
	 * The barrier makes sure the PE is updated
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	ret = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, handle);
	WARN_ON(ret);

	mutex_unlock(&spa->spa_lock);
	return ret;
}
/*
 * Remove the process element associated with a PASID from the SPA.
 *
 * The element is zeroed, evicted from the NPU context cache and its
 * pe_data is dropped from the radix tree. For a user context, the mmu
 * notifier (when MMIO TLB invalidates are in use), the copro
 * registration and the mm_count reference are released as well.
 *
 * @link_handle: the link, as an opaque pointer to a struct ocxl_link
 * @pasid: PASID whose process element is removed
 *
 * Returns -EINVAL when @pasid exceeds SPA_PASID_MAX, otherwise the
 * value returned by pnv_ocxl_spa_remove_pe_from_cache().
 */
int ocxl_link_remove_pe(void *link_handle, int pasid)
{
	struct ocxl_link *link = link_handle;
	struct spa *spa = link->spa;
	struct ocxl_process_element *pe;
	struct pe_data *pe_data;
	int pe_handle, rc;

	if (pasid > SPA_PASID_MAX)
		return -EINVAL;

	/*
	 * About synchronization with our memory fault handler:
	 *
	 * Before removing the PE, the driver is supposed to have
	 * notified the AFU, which should have cleaned up and make
	 * sure the PASID is no longer in use, including pending
	 * interrupts. However, there's no way to be sure...
	 *
	 * We clear the PE and remove the context from our radix
	 * tree. From that point on, any new interrupt for that
	 * context will fail silently, which is ok. As mentioned
	 * above, that's not expected, but it could happen if the
	 * driver or AFU didn't do the right thing.
	 *
	 * There could still be a bottom half running, but we don't
	 * need to wait/flush, as it is managing a reference count on
	 * the mm it reads from the radix tree.
	 */
	pe_handle = pasid & SPA_PE_MASK;
	pe = spa->spa_mem + pe_handle;

	/*
	 * Serialize against the other SPA updaters; this pairs with the
	 * mutex_unlock() at the end of the function.
	 */
	mutex_lock(&spa->spa_lock);

	memset(pe, 0, sizeof(struct ocxl_process_element));
	/*
	 * The barrier makes sure the PE is removed from the SPA
	 * before we clear the NPU context cache below, so that the
	 * old PE cannot be reloaded erroneously.
	 */
	mb();

	/*
	 * hook to platform code
	 * On powerpc, the entry needs to be cleared from the context
	 * cache of the NPU.
	 */
	rc = pnv_ocxl_spa_remove_pe_from_cache(link->platform_data, pe_handle);
	WARN_ON(rc);

	pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
	if (!pe_data) {
		WARN(1, "Couldn't find pe data when removing PE\n");
	} else {
		if (pe_data->mm) {
			if (link->arva) {
				trace_ocxl_release_mmu_notifier(pasid,
								pe_data->mm->context.id);
				mmu_notifier_unregister(&pe_data->mmu_notifier,
							pe_data->mm);
				/* shootdowns through the ATSD registers are serialized */
				spin_lock(&link->atsd_lock);
				pnv_ocxl_tlb_invalidate(link->arva,
							pe_data->mm->context.id,
							0ull,
							PAGE_SIZE);
				spin_unlock(&link->atsd_lock);
			}
			mm_context_remove_copro(pe_data->mm);
			mmdrop(pe_data->mm);
		}
		/* readers may still hold the pointer under rcu_read_lock() */
		kfree_rcu(pe_data, rcu);
	}

	mutex_unlock(&spa->spa_lock);
	return rc;
}
int ocxl_link_irq_alloc(void *link_handle, int *hw_irq)
{ struct ocxl_link *link = link_handle; int irq;
if (atomic_dec_if_positive(&link->irq_available) < 0) return -ENOSPC;
irq = xive_native_alloc_irq(); if (!irq) {
atomic_inc(&link->irq_available); return -ENXIO;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.