drm/amdkfd: Handle save/restore of lds allocated in 1280B blocks

The gfx-9 trap handler reads the LDS allocation size in 256-byte
granularity (from SQ_WAVE_LDS_ALLOC), but it assumes that this value is
always even (i.e. that the LDS allocation is really done in multiples of
512 bytes).  This was true so far, but gfx-950 allocates LDS in chunks of
1280 bytes, making this assumption invalid.  This can cause the trap
handler to try to save / restore past the end of LDS, and past the LDS
allocated slot in the save area, overwriting data from the following
wave.

This patch updates the trap handler to support LDS allocated in
1280-byte blocks:
- During restore, copy from main memory directly to LDS in batches of
  1280 bytes.
- During save, continue to use 512-byte blocks (we only have 2 VGPRs we
  can use to hold data), making sure to mask the upper half of the wave
  when the LDS size is not a multiple of 512 bytes.

Signed-off-by: Lancelot SIX <lancelot.six@amd.com>
Co-authored-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: Jay Cornwall <jay.cornwall@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Lancelot SIX 2024-07-12 23:22:29 +01:00 committed by Alex Deucher
parent 549120edfd
commit 5690011a70
2 changed files with 49 additions and 36 deletions

View File

@ -4124,7 +4124,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
};
static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0xbf820001, 0xbf8202ea,
0xbf820001, 0xbf8202d8,
0xb8f8f802, 0x8978ff78,
0x00020006, 0xb8fbf803,
0x866eff78, 0x00002000,
@ -4321,9 +4321,9 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0xe0724300, 0x701d0300,
0xbefe00c1, 0xbeff00c1,
0xb8fb5306, 0x867bc17b,
0xbf840064, 0xbf8a0000,
0xbf840052, 0xbf8a0000,
0x867aff6f, 0x04000000,
0xbf840060, 0x8e7b867b,
0xbf84004e, 0x8e7b867b,
0x8e7b827b, 0xbef6007b,
0xb8f02985, 0x80708170,
0x8e708a70, 0x8e708170,
@ -4336,8 +4336,8 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0x000204c1, 0x867aff78,
0x00400000, 0xbf850003,
0xb8faf803, 0x897a7aff,
0x10000000, 0xbf850030,
0x24040682, 0xd86e4000,
0x10000000, 0xbf85001d,
0x24040682, 0xd86c0000,
0x00000002, 0xbf8cc07f,
0xbe840080, 0xd2890000,
0x00000900, 0x80048104,
@ -4348,29 +4348,20 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0x80048104, 0xc069003a,
0x00000070, 0xbf8cc07f,
0x80709070, 0xbf06c004,
0xbf84ffee, 0xbe840080,
0xd2890000, 0x00000901,
0x80048104, 0xd2890001,
0x00000901, 0x80048104,
0xd2890002, 0x00000901,
0x80048104, 0xd2890003,
0x00000901, 0x80048104,
0xc069003a, 0x00000070,
0xbf8cc07f, 0x80709070,
0xbf06c004, 0xbf84ffee,
0x680404ff, 0x00000200,
0xbf84ffee, 0x680404ff,
0x00000100, 0xd0c9006a,
0x0000f702, 0xbf87ffe5,
0xbf820016, 0xd1060002,
0x00011103, 0x7e0602ff,
0x00000200, 0xbefc00ff,
0x00010000, 0xbe800077,
0x8677ff77, 0xff7fffff,
0x8777ff77, 0x00058000,
0xd8ec0000, 0x00000002,
0xbf8cc07f, 0xe0765000,
0x701d0002, 0x68040702,
0xd0c9006a, 0x0000f702,
0xbf87ffd2, 0xbf820015,
0xd1060002, 0x00011103,
0x7e0602ff, 0x00000200,
0xbefc00ff, 0x00010000,
0xbe800077, 0x8677ff77,
0xff7fffff, 0x8777ff77,
0x00058000, 0xd8ec0000,
0x00000002, 0xbf8cc07f,
0xe0765000, 0x701d0002,
0x68040702, 0xd0c9006a,
0x0000f702, 0xbf87fff7,
0xbefe016a, 0xbf87fff6,
0xbef70000, 0xbef000ff,
0x00000400, 0xbefe00c1,
0xbeff00c1, 0xb8fb2b05,
@ -4497,15 +4488,15 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0x701d0300, 0x807c847c,
0x8070ff70, 0x00000400,
0xbf0a7b7c, 0xbf85ffeb,
0xbf9c0000, 0xbf8200ee,
0xbf9c0000, 0xbf8200f4,
0xbef4007e, 0x8675ff7f,
0x0000ffff, 0x8775ff75,
0x00040000, 0xbef60080,
0xbef700ff, 0x00807fac,
0x866eff7f, 0x04000000,
0xbf84001f, 0xbefe00c1,
0xbf840025, 0xbefe00c1,
0xbeff00c1, 0xb8ef5306,
0x866fc16f, 0xbf84001a,
0x866fc16f, 0xbf840020,
0x8e6f866f, 0x8e6f826f,
0xbef6006f, 0xb8f82985,
0x80788178, 0x8e788a78,
@ -4516,9 +4507,12 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
0x01000000, 0xbefc0080,
0xe0510000, 0x781d0000,
0xe0510100, 0x781d0000,
0x807cff7c, 0x00000200,
0x8078ff78, 0x00000200,
0xbf0a6f7c, 0xbf85fff6,
0xe0510200, 0x781d0000,
0xe0510300, 0x781d0000,
0xe0510400, 0x781d0000,
0x807cff7c, 0x00000500,
0x8078ff78, 0x00000500,
0xbf0a6f7c, 0xbf85fff0,
0xbefe00c1, 0xbeff00c1,
0xbef600ff, 0x01000000,
0xb8ef2b05, 0x806f816f,

View File

@ -75,8 +75,10 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
// LDS_RESTORE_GRANULARITY_BYTES is the number of bytes by which the LDS
// restore loop advances m0 and the memory offset on each iteration.
#if ASIC_FAMILY >= CHIP_GC_9_5_0
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11
var LDS_RESTORE_GRANULARITY_BYTES = 1280
#else
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
var LDS_RESTORE_GRANULARITY_BYTES = 512
#endif
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
@ -572,12 +574,21 @@ if SAVE_AFTER_XNACK_ERROR
v_lshlrev_b32 v2, 2, v3
// Save LDS through write_vgprs_to_mem_with_sqc, looping while the LDS
// address in v2 is still below s_save_alloc_size.
L_SAVE_LDS_LOOP_SQC:
#if ASIC_FAMILY < CHIP_GC_9_5_0
// Pre-gfx950: read two dwords per lane into v[0:1], write both out, and
// advance v2 by 0x200 per iteration.
ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
s_waitcnt lgkmcnt(0)
write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
v_add_u32 v2, 0x200, v2
#else
// gfx950 needs to save in multiples of 256 bytes: read a single dword per
// lane into v0, write it out, and advance v2 by 0x100 per iteration.
ds_read_b32 v0, v2
s_waitcnt lgkmcnt(0)
write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
v_add_u32 v2, 0x100, v2
#endif
// Loop again while v2 < s_save_alloc_size.
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
@ -601,6 +612,9 @@ L_SAVE_LDS_LOOP_VECTOR:
// v_add_u32 v2, vcc[0:1], v2, v3
v_add_u32 v2, v2, v3
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
#if ASIC_FAMILY >= CHIP_GC_9_5_0
s_mov_b64 exec, vcc
#endif
s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
// restore rsrc3
@ -763,8 +777,13 @@ L_RESTORE:
// Restore LDS by loading from the restore buffer with lds:1, in steps of
// LDS_RESTORE_GRANULARITY_BYTES, until m0 reaches s_restore_alloc_size.
L_RESTORE_LDS_LOOP:
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
s_add_u32 m0, m0, 256*2 // 128 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
#if ASIC_FAMILY >= CHIP_GC_9_5_0
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // fourth 64DW
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW
#endif
s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?