/*
 * Generate one p4_pebs_bind_map[] entry for a PEBS metric:
 * it sets P4_PEBS_ENABLE_UOP_TAG as well
 */
#define P4_GEN_PEBS_BIND(name, pebs, vert) \
[P4_PEBS_METRIC__##name] = { \
.metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
.metric_vert = vert, \
}
/* * note we have P4_PEBS_ENABLE_UOP_TAG always set here * * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of * event configuration to find out which values are to be * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT * registers
*/ staticstruct p4_pebs_bind p4_pebs_bind_map[] = {
P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
};
/* * Because of Netburst being quite restricted in how many * identical events may run simultaneously, we introduce event aliases, * ie the different events which have the same functionality but * utilize non-intersected resources (ESCR/CCCR/counter registers). * * This allow us to relax restrictions a bit and run two or more * identical events together. * * Never set any custom internal bits such as P4_CONFIG_HT, * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are * either up to date automatically or not applicable at all.
*/ staticstruct p4_event_alias {
u64 original;
u64 alternative;
} p4_event_aliases[] = {
{ /* * Non-halted cycles can be substituted with non-sleeping cycles (see * Intel SDM Vol3b for details). We need this alias to be able * to run nmi-watchdog and 'perf top' (or any other user space tool * which is interested in running PERF_COUNT_HW_CPU_CYCLES) * simultaneously.
*/
.original =
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
.alternative =
p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
P4_CCCR_COMPARE),
},
};
static u64 p4_get_alias_event(u64 config)
{
	/*
	 * NOTE(review): config_match and i are declared but never used below.
	 * Together with the foreign fragments further down, this strongly
	 * suggests the original lookup loop over p4_event_aliases[] was lost
	 * when this file was extracted -- compare against the upstream file.
	 */
	u64 config_match; int i;
	/*
	 * Only event with special mark is allowed,
	 * we're to be sure it didn't come as malformed
	 * RAW event.
	 */
	if (!(config & P4_CONFIG_ALIASABLE)) return 0;
	/*
	 * NOTE(review): everything from here to the final "return 0;" does NOT
	 * belong in p4_get_alias_event.  The designated initializer below
	 * ([PERF_COUNT_HW_INSTRUCTIONS] = ...) is a fragment of an event
	 * config table, and the checks on "event", "v" and "emask" (none of
	 * which are declared in this function) look like the body of a raw
	 * event validation routine.  Extraction artifact -- the code as it
	 * stands cannot compile; kept byte-identical pending repair.
	 */
	/*
	 * retired instructions
	 * in a sake of simplicity we don't use the FSB tagging
	 */
	[PERF_COUNT_HW_INSTRUCTIONS] =
	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
	P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
	P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
	/* User data may have out-of-bound event index */
	v = p4_config_unpack_event(event->attr.config); if (v >= ARRAY_SIZE(p4_event_bind_map)) return -EINVAL;
	/* It may be unsupported: */
	if (!p4_event_match_cpu_model(v)) return -EINVAL;
	/*
	 * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
	 * in Architectural Performance Monitoring, it means not
	 * on _which_ logical cpu to count but rather _when_, ie it
	 * depends on logical cpu state -- count event if one cpu active,
	 * none, both or any, so we just allow user to pass any value
	 * desired.
	 *
	 * In turn we always set Tx_OS/Tx_USR bits bound to logical
	 * cpu without their propagation to another cpu
	 */
	/*
	 * if an event is shared across the logical threads
	 * the user needs special permissions to be able to use it
	 */
	if (p4_ht_active() && p4_event_bind_map[v].shared) {
		v = perf_allow_cpu(); if (v) return v;
	}
	/* ESCR EventMask bits may be invalid */
	emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; if (emask & ~p4_event_bind_map[v].escr_emask) return -EINVAL;
	/*
	 * it may have some invalid PEBS bits
	 */
	if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) return -EINVAL;
	v = p4_config_unpack_metric(event->attr.config); if (v >= ARRAY_SIZE(p4_pebs_bind_map)) return -EINVAL;
	return 0;
}
/*
 * Set up a P4 perf event's hardware config from its attr.
 *
 * NOTE(review): "staticint" below (and "unsignedint" inside) are missing a
 * space -- extraction artifacts; they should read "static int" and
 * "unsigned int".  Kept byte-identical here.
 */
staticint p4_hw_config(struct perf_event *event)
{
	int cpu = get_cpu(); int rc = 0;
	u32 escr, cccr;
	/*
	 * the reason we use cpu that early is that: if we get scheduled
	 * first time on the same cpu -- we will not need swap thread
	 * specific flags in config (and will save some cpu cycles)
	 */
	if (p4_ht_active() && p4_ht_thread(cpu))
		event->hw.config = p4_set_ht_bit(event->hw.config);
	if (event->attr.type == PERF_TYPE_RAW) { struct p4_event_bind *bind; unsignedint esel;
		/*
		 * Clear bits we reserve to be managed by kernel itself
		 * and never allowed from a user space
		 */
		event->attr.config &= P4_CONFIG_MASK;
		rc = p4_validate_raw_event(event); if (rc) goto out;
		/*
		 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
		 * bits since we keep additional info here (for cache events and etc)
		 */
		event->hw.config |= event->attr.config;
		bind = p4_config_get_bind(event->attr.config); if (!bind) {
			rc = -EINVAL; goto out;
		}
		esel = P4_OPCODE_ESEL(bind->opcode);
		event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
	}
	/*
	 * NOTE(review): the remainder of this function is foreign code.  It
	 * references "hwc" and "v", which are not declared here, and matches
	 * the overflow-detection logic of a CCCR-overflow-clearing helper.
	 * The real p4_hw_config tail ("out:" label, put_cpu(), return rc)
	 * appears to have been lost during extraction.
	 */
	/* an official way for overflow indication */
	rdmsrq(hwc->config_base, v); if (v & P4_CCCR_OVF) {
		wrmsrq(hwc->config_base, v & ~P4_CCCR_OVF); return 1;
	}
	/*
	 * In some circumstances the overflow might issue an NMI but did
	 * not set P4_CCCR_OVF bit. Because a counter holds a negative value
	 * we simply check for high bit being set, if it's cleared it means
	 * the counter has reached zero value and continued counting before
	 * real NMI signal was received:
	 */
	rdmsrq(hwc->event_base, v); if (!(v & ARCH_P4_UNFLAGGED_BIT)) return 1;
	return 0;
}
/* Intentionally a no-op: PEBS metrics are left enabled, see FIXME below. */
static void p4_pmu_disable_pebs(void)
{
	/*
	 * FIXME
	 *
	 * It's still allowed that two threads setup same cache
	 * events so we can't simply clear metrics until we knew
	 * no one is depending on us, so we need kind of counter
	 * for "ReplayEvent" users.
	 *
	 * What is more complex -- RAW events, if user (for some
	 * reason) will pass some cache event metric with improper
	 * event opcode -- it's fine from hardware point of view
	 * but completely nonsense from "meaning" of such action.
	 *
	 * So at moment let leave metrics turned on forever -- it's
	 * ok for now but need to be revisited!
	 *
	 * (void)wrmsrq_safe(MSR_IA32_PEBS_ENABLE, 0);
	 * (void)wrmsrq_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
	 */
}
/*
 * NOTE(review): orphaned fragment.  The enclosing function header
 * (presumably an event-disable routine declaring "hwc") was lost during
 * extraction; only this tail and its closing brace survive.
 */
/*
 * If event gets disabled while counter is in overflowed
 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
 * asserted again and again
 */
(void)wrmsrq_safe(hwc->config_base,
	p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
}
/*
 * NOTE(review): garbled region.  "staticvoid" is missing a space
 * (extraction artifact; should be "static void").  The declared locals
 * (cpuc, idx) are never used, while the statements below reference hwc,
 * escr_conf, bind and cccr -- none declared here -- which matches the body
 * of an event-*enable* routine.  The real p4_pmu_disable_all loop over
 * active counters was apparently lost, and no closing brace is present
 * before the next function definition.
 */
staticvoid p4_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx;
	/*
	 * - we dont support cascaded counters yet
	 * - and counter 1 is broken (erratum)
	 */
	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
	WARN_ON_ONCE(hwc->idx == 1);
	/* we need a real Event value */
	escr_conf &= ~P4_ESCR_EVENT_MASK;
	escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
	cccr = p4_config_unpack_cccr(hwc->config);
	/*
	 * it could be Cache event so we need to write metrics
	 * into additional MSRs
	 */
	p4_pmu_enable_pebs(hwc->config);
/*
 * Program the sampling period, re-writing the counter MSR once more to
 * work around a hardware erratum.  Returns the value of the generic
 * x86_perf_event_set_period().
 */
static int p4_pmu_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = this_cpu_read(pmc_prev_left[hwc->idx]);
	int ret;

	ret = x86_perf_event_set_period(event);

	if (hwc->event_base) {
		/*
		 * This handles erratum N15 in intel doc 249199-029,
		 * the counter may not be updated correctly on write
		 * so we need a second write operation to do the trick
		 * (the official workaround didn't work)
		 *
		 * the former idea is taken from OProfile code
		 */
		wrmsrq(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
	}

	/*
	 * NOTE(review): the original extraction was truncated here (it fell
	 * straight into foreign interrupt-handler code); restored the return
	 * of the generic helper's result and the closing brace.
	 */
	return ret;
}
/*
 * NOTE(review): orphaned fragment of a PMU NMI/interrupt handler.  Its
 * function header and local declarations (handled, event, hwc, val, data,
 * regs, idx, cpuc) were lost during extraction; the final "}" closes that
 * lost function.  Kept byte-identical.
 */
for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { int overflow;
	if (!test_bit(idx, cpuc->active_mask)) {
		/* catch in-flight IRQs */
		if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id())))
			handled++; continue;
	}
	event = cpuc->events[idx];
	hwc = &event->hw;
	WARN_ON_ONCE(hwc->idx != idx);
	/* it might be unflagged overflow */
	overflow = p4_pmu_clear_cccr_ovf(hwc);
	/* skip counters that did not really overflow (high bit still set) */
	val = x86_perf_event_update(event); if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue;
	handled += overflow;
	/* event overflow for sure */
	perf_sample_data_init(&data, 0, hwc->last_period);
	if (!static_call(x86_pmu_set_period)(event)) continue;
	perf_event_overflow(event, &data, regs);
}
if (handled)
	inc_irq_stat(apic_perf_irqs);
/*
 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
 * been observed that the OVF bit flag has to be cleared first _before_
 * the LVTPC can be unmasked.
 *
 * The reason is the NMI line will continue to be asserted while the OVF
 * bit is set. This causes a second NMI to generate if the LVTPC is
 * unmasked before the OVF bit is cleared, leading to unknown NMI
 * messages.
 */
apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
}
/*
 * swap thread specific fields according to a thread
 * we are going to run on
 *
 * NOTE(review): truncated -- "staticvoid" is missing a space (should be
 * "static void"), the locals escr/cccr are never used, the actual
 * flag-swapping logic is absent, and there is no closing brace before the
 * next definition.  The trailing ESCR-hashing comment belongs to a
 * different (missing) helper.
 */
staticvoid p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
{
	u32 escr, cccr;
	/*
	 * we either lucky and continue on same cpu or no HT support
	 */
	if (!p4_should_swap_ts(hwc->config, cpu)) return;
	/*
	 * the event is migrated from an another logical
	 * cpu, so we need to swap thread specific flags
	 */
	/*
	 * ESCR address hashing is tricky, ESCRs are not sequential
	 * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
	 * the metric between any ESCRs is laid in range [0xa0,0xe1]
	 *
	 * so we make ~70% filled hashtable
	 */
staticint p4_next_cntr(int thread, unsignedlong *used_mask, struct p4_event_bind *bind)
{ int i, j;
for (i = 0; i < P4_CNTR_LIMIT; i++) {
j = bind->cntr[thread][i]; if (j != -1 && !test_bit(j, used_mask)) return j;
}
return -1;
}
/*
 * Assign hardware counters to the n events in cpuc, honouring P4/HT
 * resource constraints, falling back to event aliases when the primary
 * resources are taken.
 *
 * NOTE(review): garbled region.  "staticint"/"unsignedlong" are missing
 * spaces (extraction artifacts).  The per-event loop header that should
 * initialize i/num/pass and set up hwc/bind/thread/escr_idx is missing, as
 * is the "done:" label targeted by the gotos, so "pass" is read
 * uninitialized and several identifiers are unbound.  The kdump comment and
 * the register-zeroing loop near the end (using undeclared "reg") belong to
 * a different, lost function.  Kept byte-identical pending repair against
 * the upstream file.
 */
staticint p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	unsignedlong used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsignedlong escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; int cpu = smp_processor_id(); struct hw_perf_event *hwc; struct p4_event_bind *bind; unsignedint i, thread, num; int cntr_idx, escr_idx;
	u64 config_alias; int pass;
again:
	/*
	 * It's possible to hit a circular lock
	 * between original and alternative events
	 * if both are scheduled already.
	 */
	if (pass > 2) goto done;
	if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
		cntr_idx = hwc->idx; if (assign)
		assign[i] = hwc->idx; goto reserve;
	}
	cntr_idx = p4_next_cntr(thread, used_mask, bind); if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
		/*
		 * Check whether an event alias is still available.
		 */
		config_alias = p4_get_alias_event(hwc->config); if (!config_alias) goto done;
		hwc->config = config_alias;
		pass++; goto again;
	}
	/*
	 * Perf does test runs to see if a whole group can be assigned
	 * together successfully. There can be multiple rounds of this.
	 * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
	 * bits, such that the next round of group assignments will
	 * cause the above p4_should_swap_ts to pass instead of fail.
	 * This leads to counters exclusive to thread0 being used by
	 * thread1.
	 *
	 * Solve this with a cheap hack, reset the idx back to -1 to
	 * force a new lookup (p4_next_cntr) to get the right counter
	 * for the right thread.
	 *
	 * This probably doesn't comply with the general spirit of how
	 * perf wants to work, but P4 is special. :-(
	 */
	if (p4_should_swap_ts(hwc->config, cpu))
		hwc->idx = -1;
	p4_pmu_swap_config_ts(hwc, cpu); if (assign)
		assign[i] = cntr_idx;
reserve:
	set_bit(cntr_idx, used_mask);
	set_bit(escr_idx, escr_mask);
	}
	/*
	 * Even though the counters are configured to interrupt a particular
	 * logical processor when an overflow happens, testing has shown that
	 * on kdump kernels (which uses a single cpu), thread1's counter
	 * continues to run and will report an NMI on thread0. Due to the
	 * overflow bug, this leads to a stream of unknown NMIs.
	 *
	 * Solve this by zero'ing out the registers to mimic a reset.
	 */
	for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		reg = x86_pmu_config_addr(i);
		wrmsrq_safe(reg, 0ULL);
	}
	return 0;
}
/*
 * NOTE(review): the following lines are residue from the web page this
 * source was scraped from (a German code viewer's timing and disclaimer
 * footer); they are not part of the source file.  Wrapped in a comment so
 * they no longer break the translation unit; preserved verbatim below:
 *
 * Messung V0.5
 * Dauer der Verarbeitung: 0.59 Sekunden
 * (vorverarbeitet)
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfaeltig zusammengestellt. Es wird jedoch weder
 * Vollstaendigkeit, noch Richtigkeit, noch Qualitaet der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */