mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 02:24:24 +02:00
drm/amdkfd: Handle save/restore of lds allocated in 1280B blocks
The gfx-9 trap handler reads the LDS allocation size at 256-byte granularity (from SQ_WAVE_LDS_ALLOC), but it assumes that this value is always even (i.e. that the LDS allocation is really done in multiples of 512 bytes). This has been true so far, but gfx-950 allocates LDS in chunks of 1280 bytes, making this assumption invalid. This can cause the trap handler to try to save / restore past the end of LDS, and past the slot allocated for LDS in the save area, overwriting data from the following wave. This patch updates the trap handler to support LDS allocated in 1280-byte blocks: - During restore, copy from main memory directly to LDS in batches of 1280 bytes. - During save, continue to use 512-byte blocks (we only have 2 VGPRs we can use to hold data), making sure to mask the upper half of the wave when the LDS size is not a multiple of 512 bytes. Signed-off-by: Lancelot SIX <lancelot.six@amd.com> Co-authored-by: Alex Sierra <alex.sierra@amd.com> Reviewed-by: Jay Cornwall <jay.cornwall@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
549120edfd
commit
5690011a70
|
|
@ -4124,7 +4124,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
|
|||
};
|
||||
|
||||
static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
||||
0xbf820001, 0xbf8202ea,
|
||||
0xbf820001, 0xbf8202d8,
|
||||
0xb8f8f802, 0x8978ff78,
|
||||
0x00020006, 0xb8fbf803,
|
||||
0x866eff78, 0x00002000,
|
||||
|
|
@ -4321,9 +4321,9 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
|||
0xe0724300, 0x701d0300,
|
||||
0xbefe00c1, 0xbeff00c1,
|
||||
0xb8fb5306, 0x867bc17b,
|
||||
0xbf840064, 0xbf8a0000,
|
||||
0xbf840052, 0xbf8a0000,
|
||||
0x867aff6f, 0x04000000,
|
||||
0xbf840060, 0x8e7b867b,
|
||||
0xbf84004e, 0x8e7b867b,
|
||||
0x8e7b827b, 0xbef6007b,
|
||||
0xb8f02985, 0x80708170,
|
||||
0x8e708a70, 0x8e708170,
|
||||
|
|
@ -4336,8 +4336,8 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
|||
0x000204c1, 0x867aff78,
|
||||
0x00400000, 0xbf850003,
|
||||
0xb8faf803, 0x897a7aff,
|
||||
0x10000000, 0xbf850030,
|
||||
0x24040682, 0xd86e4000,
|
||||
0x10000000, 0xbf85001d,
|
||||
0x24040682, 0xd86c0000,
|
||||
0x00000002, 0xbf8cc07f,
|
||||
0xbe840080, 0xd2890000,
|
||||
0x00000900, 0x80048104,
|
||||
|
|
@ -4348,29 +4348,20 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
|||
0x80048104, 0xc069003a,
|
||||
0x00000070, 0xbf8cc07f,
|
||||
0x80709070, 0xbf06c004,
|
||||
0xbf84ffee, 0xbe840080,
|
||||
0xd2890000, 0x00000901,
|
||||
0x80048104, 0xd2890001,
|
||||
0x00000901, 0x80048104,
|
||||
0xd2890002, 0x00000901,
|
||||
0x80048104, 0xd2890003,
|
||||
0x00000901, 0x80048104,
|
||||
0xc069003a, 0x00000070,
|
||||
0xbf8cc07f, 0x80709070,
|
||||
0xbf06c004, 0xbf84ffee,
|
||||
0x680404ff, 0x00000200,
|
||||
0xbf84ffee, 0x680404ff,
|
||||
0x00000100, 0xd0c9006a,
|
||||
0x0000f702, 0xbf87ffe5,
|
||||
0xbf820016, 0xd1060002,
|
||||
0x00011103, 0x7e0602ff,
|
||||
0x00000200, 0xbefc00ff,
|
||||
0x00010000, 0xbe800077,
|
||||
0x8677ff77, 0xff7fffff,
|
||||
0x8777ff77, 0x00058000,
|
||||
0xd8ec0000, 0x00000002,
|
||||
0xbf8cc07f, 0xe0765000,
|
||||
0x701d0002, 0x68040702,
|
||||
0xd0c9006a, 0x0000f702,
|
||||
0xbf87ffd2, 0xbf820015,
|
||||
0xd1060002, 0x00011103,
|
||||
0x7e0602ff, 0x00000200,
|
||||
0xbefc00ff, 0x00010000,
|
||||
0xbe800077, 0x8677ff77,
|
||||
0xff7fffff, 0x8777ff77,
|
||||
0x00058000, 0xd8ec0000,
|
||||
0x00000002, 0xbf8cc07f,
|
||||
0xe0765000, 0x701d0002,
|
||||
0x68040702, 0xd0c9006a,
|
||||
0x0000f702, 0xbf87fff7,
|
||||
0xbefe016a, 0xbf87fff6,
|
||||
0xbef70000, 0xbef000ff,
|
||||
0x00000400, 0xbefe00c1,
|
||||
0xbeff00c1, 0xb8fb2b05,
|
||||
|
|
@ -4497,15 +4488,15 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
|||
0x701d0300, 0x807c847c,
|
||||
0x8070ff70, 0x00000400,
|
||||
0xbf0a7b7c, 0xbf85ffeb,
|
||||
0xbf9c0000, 0xbf8200ee,
|
||||
0xbf9c0000, 0xbf8200f4,
|
||||
0xbef4007e, 0x8675ff7f,
|
||||
0x0000ffff, 0x8775ff75,
|
||||
0x00040000, 0xbef60080,
|
||||
0xbef700ff, 0x00807fac,
|
||||
0x866eff7f, 0x04000000,
|
||||
0xbf84001f, 0xbefe00c1,
|
||||
0xbf840025, 0xbefe00c1,
|
||||
0xbeff00c1, 0xb8ef5306,
|
||||
0x866fc16f, 0xbf84001a,
|
||||
0x866fc16f, 0xbf840020,
|
||||
0x8e6f866f, 0x8e6f826f,
|
||||
0xbef6006f, 0xb8f82985,
|
||||
0x80788178, 0x8e788a78,
|
||||
|
|
@ -4516,9 +4507,12 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
|
|||
0x01000000, 0xbefc0080,
|
||||
0xe0510000, 0x781d0000,
|
||||
0xe0510100, 0x781d0000,
|
||||
0x807cff7c, 0x00000200,
|
||||
0x8078ff78, 0x00000200,
|
||||
0xbf0a6f7c, 0xbf85fff6,
|
||||
0xe0510200, 0x781d0000,
|
||||
0xe0510300, 0x781d0000,
|
||||
0xe0510400, 0x781d0000,
|
||||
0x807cff7c, 0x00000500,
|
||||
0x8078ff78, 0x00000500,
|
||||
0xbf0a6f7c, 0xbf85fff0,
|
||||
0xbefe00c1, 0xbeff00c1,
|
||||
0xbef600ff, 0x01000000,
|
||||
0xb8ef2b05, 0x806f816f,
|
||||
|
|
|
|||
|
|
@ -75,8 +75,10 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000
|
|||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
|
||||
#if ASIC_FAMILY >= CHIP_GC_9_5_0
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 11
|
||||
var LDS_RESTORE_GRANULARITY_BYTES = 1280
|
||||
#else
|
||||
var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
|
||||
var LDS_RESTORE_GRANULARITY_BYTES = 512
|
||||
#endif
|
||||
var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
|
||||
var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
|
||||
|
|
@ -572,12 +574,21 @@ if SAVE_AFTER_XNACK_ERROR
|
|||
|
||||
v_lshlrev_b32 v2, 2, v3
|
||||
L_SAVE_LDS_LOOP_SQC:
|
||||
#if ASIC_FAMILY < CHIP_GC_9_5_0
|
||||
ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
|
||||
s_waitcnt lgkmcnt(0)
|
||||
|
||||
write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
|
||||
|
||||
v_add_u32 v2, 0x200, v2
|
||||
#else
|
||||
// gfx950 needs to save in multiples of 256 bytes.
|
||||
ds_read_b32 v0, v2
|
||||
s_waitcnt lgkmcnt(0)
|
||||
write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
|
||||
|
||||
v_add_u32 v2, 0x100, v2
|
||||
#endif
|
||||
|
||||
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
|
||||
s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
|
||||
|
||||
|
|
@ -601,6 +612,9 @@ L_SAVE_LDS_LOOP_VECTOR:
|
|||
// v_add_u32 v2, vcc[0:1], v2, v3
|
||||
v_add_u32 v2, v2, v3
|
||||
v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
|
||||
#if ASIC_FAMILY >= CHIP_GC_9_5_0
|
||||
s_mov_b64 exec, vcc
|
||||
#endif
|
||||
s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
|
||||
|
||||
// restore rsrc3
|
||||
|
|
@ -763,8 +777,13 @@ L_RESTORE:
|
|||
L_RESTORE_LDS_LOOP:
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
|
||||
s_add_u32 m0, m0, 256*2 // 128 DW
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
|
||||
#if ASIC_FAMILY >= CHIP_GC_9_5_0
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512 // third 64DW
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768 // forth 64DW
|
||||
buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024 // fifth 64DW
|
||||
#endif
|
||||
s_add_u32 m0, m0, LDS_RESTORE_GRANULARITY_BYTES // 128/320 DW
|
||||
s_add_u32 s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES //mem offset increased by 128/320 DW
|
||||
s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
|
||||
s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user