/* * MI300 has a fixed, model-specific mapping between a UMC instance and * its related Data Fabric Coherent Station instance. * * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the * UMC instance within a Node. Use this to find the appropriate Coherent * Station ID. * * Redundant bits were removed from the map below.
*/ staticconst u16 umc_coh_st_map[32] = {
0x393, 0x293, 0x193, 0x093,
0x392, 0x292, 0x192, 0x092,
0x391, 0x291, 0x191, 0x091,
0x390, 0x290, 0x190, 0x090,
0x793, 0x693, 0x593, 0x493,
0x792, 0x692, 0x592, 0x492,
0x791, 0x691, 0x591, 0x491,
0x790, 0x690, 0x590, 0x490,
};
/*
 * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
 * for hashing.
 *
 * Also, read UMC::CH::Addr{Cfg,Sel,Sel2} and UMC::CH::ColSelLo registers to
 * get the values needed to reconstruct the normalized address. Apply additional
 * offsets to the raw register values, as needed.
 *
 * Do this during module init, since the values will not change during run time.
 *
 * These registers are instantiated for each UMC across each AMD Node.
 * However, they should be identically programmed due to the fixed hardware
 * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
 * single global structure for simplicity.
 *
 * Return: 0 on success, otherwise the non-zero value returned by the
 * failing amd_smn_read() call.
 *
 * NOTE(review): the body below looks truncated. The loop over NUM_BANK_BITS
 * is never closed, nothing is stored from the per-bank register reads, and
 * only the AddrSel2 read is visible even though the description above lists
 * more registers. Restore the missing statements from the original file
 * before relying on this function.
 */ int get_umc_info_mi300(void)
{
u32 temp; int ret;
u8 i;
for (i = 0; i < NUM_BANK_BITS; i++) {
/* The register address advances by 4 for each bank bit. */
ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp); if (ret) return ret;
ret = amd_smn_read(0, MI300_ADDR_SEL_2, &temp); if (ret) return ret;
/* Use BankBit5 for the SID1 position. */
bit_shifts.sid[1] = 5 + FIELD_GET(ADDR_SEL_2_BANK5, temp);
/* The raw channel field is offset by 5, like the SID1 field above. */
bit_shifts.pc = 5 + FIELD_GET(ADDR_SEL_2_CHAN, temp);
return 0;
}
/*
 * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
 * be converted to the intermediate normalized address (NA) before translating to a
 * system physical address.
 *
 * The DRAM address includes bank, row, and column. Also included are bits for
 * pseudochannel (PC) and stack ID (SID).
 *
 * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
 *
 * The MCA address format is as follows:
 *	MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
 *
 * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
 * reconstructing the normalized address.
 *
 * Each mask must sit on its own line: a preprocessing directive runs to the
 * end of the line, so several #define directives sharing one line collapse
 * into a single macro with a bogus replacement list.
 */

/*
 * Column mask taken from the layout above: Z[0] is bit 0, so C[4:0] is
 * MCA_ADDR[5:1]. Guarded in case the mask is already provided elsewhere.
 */
#ifndef MI300_UMC_MCA_COL
#define MI300_UMC_MCA_COL	GENMASK(5, 1)
#endif
#define MI300_UMC_MCA_BANK	GENMASK(9, 6)
#define MI300_UMC_MCA_ROW	GENMASK(24, 10)
#define MI300_UMC_MCA_PC	BIT(25)
#define MI300_UMC_MCA_SID	GENMASK(27, 26)
/*
 * Split the MCA_ADDR value in @addr into its column, bank, row,
 * pseudochannel and stack ID fields, then walk the bank bits whose hashing
 * is enabled.
 *
 * NOTE(review): this definition is cut off inside the bank-hash loop; the
 * rest of the hash handling, the address reconstruction and the return
 * statement are not visible here. Judging by the name, it presumably returns
 * the normalized address - confirm against the original file.
 *
 * NOTE(review): "staticunsignedlong" and "unsignedlong" look like keywords
 * that lost their separating spaces ("static unsigned long").
 */
staticunsignedlong convert_dram_to_norm_addr_mi300(unsignedlong addr)
{
u16 i, col, row, bank, pc, sid;
u32 temp;
/* Pull each field out of the reported address with its mask. */
col = FIELD_GET(MI300_UMC_MCA_COL, addr);
bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
row = FIELD_GET(MI300_UMC_MCA_ROW, addr);
pc = FIELD_GET(MI300_UMC_MCA_PC, addr);
sid = FIELD_GET(MI300_UMC_MCA_SID, addr);
/* Calculate hash for each Bank bit. */ for (i = 0; i < NUM_BANK_BITS; i++) { if (!addr_hash.bank[i].xor_enable) continue;
/* * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire * all memory within that DRAM row. This applies to the memory with a DRAM * bank. * * To find the memory addresses, loop through permutations of the DRAM column * bits and find the System Physical address of each. The column bits are used * to calculate the intermediate Normalized address, so all permutations should * be checked. * * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
*/ #define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) staticvoid _retire_row_mi300(struct atl_err *a_err)
{ unsignedlong addr; struct page *p;
u8 col;
for (col = 0; col < MI300_NUM_COL; col++) {
a_err->addr &= ~MI300_UMC_MCA_COL;
a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); if (IS_ERR_VALUE(addr)) continue;
addr = PHYS_PFN(addr);
/* * Skip invalid or already poisoned pages to avoid unnecessary * error messages from memory_failure().
*/
p = pfn_to_online_page(addr); if (!p) continue;
if (PageHWPoison(p)) continue;
memory_failure(addr, 0);
}
}
/*
 * In addition to the column bits, the row[13] bit should also be included when
 * calculating addresses affected by a physical row.
 *
 * Instead of running through another loop over a single bit, just run through
 * the column bits twice and flip the row[13] bit in-between.
 *
 * See MI300_UMC_MCA_ROW for the row bits in MCA_ADDR_UMC value.
 *
 * Note that @a_err->addr is modified: on return its row[13] bit is inverted
 * relative to the value passed in.
 */

/*
 * The row field is MCA_ADDR[24:10], so row[13] is bit 10 + 13 = 23.
 * Guarded in case the mask is already provided elsewhere.
 */
#ifndef MI300_UMC_MCA_ROW13
#define MI300_UMC_MCA_ROW13	BIT(23)
#endif

static void retire_row_mi300(struct atl_err *a_err)
{
	/* First pass: every column with row[13] as reported. */
	_retire_row_mi300(a_err);

	/* Second pass: every column with row[13] inverted. */
	a_err->addr ^= MI300_UMC_MCA_ROW13;
	_retire_row_mi300(a_err);
}
#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44) static u8 get_die_id(struct atl_err *err)
{ /* * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this * needs to be divided by 4 to get the internal Die ID.
*/ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) {
u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid);
return node_id >> 2;
}
/* * For CPUs, this is the AMD Node ID modulo the number * of AMD Nodes per socket.
*/ return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg();
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.