Skip to content

Commit ca6f65a

Browse files
authored
AMDGPU/GlobalISel: RegBankLegalize rules for transpose loads (#192766)
Adds RegBankLegalize rules for:
- amdgcn_global_load_tr4_b64 / tr6_b96
- amdgcn_ds_load_tr4_b64 / tr6_b96 / tr8_b64 / tr16_b128

Extends the existing amdgcn_global_load_tr_b64 / tr_b128 rules with a Uni/Div pointer split, so divergent VGPR pointers select the VADDR form while uniform pointers still select SADDR.
1 parent 0a4d470 commit ca6f65a

3 files changed

Lines changed: 124 additions & 5 deletions

File tree

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1740,12 +1740,33 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
17401740
.Any({{DivV6S32}, {{VgprV6S32}, {IntrId, VgprV32S32, Vgpr32, Vgpr32}}});
17411741

17421742
addRulesForIOpcs({amdgcn_global_load_tr_b64})
1743-
.Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
1744-
.Any({{DivB32}, {{VgprB32}, {IntrId, SgprP1}}});
1743+
.Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1744+
.Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1745+
.Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
1746+
.Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
17451747

17461748
addRulesForIOpcs({amdgcn_global_load_tr_b128})
1747-
.Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
1748-
.Any({{DivB128}, {{VgprB128}, {IntrId, SgprP1}}});
1749+
.Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
1750+
.Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
1751+
.Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
1752+
.Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
1753+
1754+
addRulesForIOpcs({amdgcn_global_load_tr4_b64})
1755+
.Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
1756+
.Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
1757+
1758+
addRulesForIOpcs({amdgcn_global_load_tr6_b96})
1759+
.Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
1760+
.Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
1761+
1762+
addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
1763+
.Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1764+
1765+
addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
1766+
.Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1767+
1768+
addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
1769+
.Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
17491770

17501771
addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
17511772
.Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,3 +74,37 @@ entry:
7474
store <4 x bfloat> %val, ptr addrspace(1) %use
7575
ret void
7676
}
77+
78+
define i32 @global_load_tr_b64_i32_vaddr(ptr addrspace(1) %addr) {
79+
; GFX12-LABEL: global_load_tr_b64_i32_vaddr:
80+
; GFX12: ; %bb.0: ; %entry
81+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
82+
; GFX12-NEXT: s_wait_expcnt 0x0
83+
; GFX12-NEXT: s_wait_samplecnt 0x0
84+
; GFX12-NEXT: s_wait_bvhcnt 0x0
85+
; GFX12-NEXT: s_wait_kmcnt 0x0
86+
; GFX12-NEXT: global_load_tr_b64 v0, v[0:1], off offset:32
87+
; GFX12-NEXT: s_wait_loadcnt 0x0
88+
; GFX12-NEXT: s_setpc_b64 s[30:31]
89+
entry:
90+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
91+
%val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep)
92+
ret i32 %val
93+
}
94+
95+
define <4 x i16> @global_load_tr_b128_v4i16_vaddr(ptr addrspace(1) %addr) {
96+
; GFX12-LABEL: global_load_tr_b128_v4i16_vaddr:
97+
; GFX12: ; %bb.0: ; %entry
98+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
99+
; GFX12-NEXT: s_wait_expcnt 0x0
100+
; GFX12-NEXT: s_wait_samplecnt 0x0
101+
; GFX12-NEXT: s_wait_bvhcnt 0x0
102+
; GFX12-NEXT: s_wait_kmcnt 0x0
103+
; GFX12-NEXT: global_load_tr_b128 v[0:1], v[0:1], off offset:32
104+
; GFX12-NEXT: s_wait_loadcnt 0x0
105+
; GFX12-NEXT: s_setpc_b64 s[30:31]
106+
entry:
107+
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
108+
%val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep)
109+
ret <4 x i16> %val
110+
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll

Lines changed: 65 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
3-
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
3+
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
44

55
define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
66
; GFX1250-LABEL: global_load_tr4_b64_vaddr:
@@ -213,6 +213,22 @@ entry:
213213
ret void
214214
}
215215

216+
define amdgpu_ps void @ds_load_tr4_b64_saddr(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
217+
; GFX1250-LABEL: ds_load_tr4_b64_saddr:
218+
; GFX1250: ; %bb.0: ; %entry
219+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
220+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
221+
; GFX1250-NEXT: ds_load_tr4_b64 v[2:3], v2 offset:32
222+
; GFX1250-NEXT: s_wait_dscnt 0x0
223+
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
224+
; GFX1250-NEXT: s_endpgm
225+
entry:
226+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
227+
%val = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
228+
store <2 x i32> %val, ptr addrspace(1) %use
229+
ret void
230+
}
231+
216232
define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
217233
; GFX1250-SDAG-LABEL: ds_load_tr8_b64:
218234
; GFX1250-SDAG: ; %bb.0: ; %entry
@@ -238,6 +254,22 @@ entry:
238254
ret void
239255
}
240256

257+
define amdgpu_ps void @ds_load_tr8_b64_saddr(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
258+
; GFX1250-LABEL: ds_load_tr8_b64_saddr:
259+
; GFX1250: ; %bb.0: ; %entry
260+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
261+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
262+
; GFX1250-NEXT: ds_load_tr8_b64 v[2:3], v2 offset:32
263+
; GFX1250-NEXT: s_wait_dscnt 0x0
264+
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
265+
; GFX1250-NEXT: s_endpgm
266+
entry:
267+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
268+
%val = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
269+
store <2 x i32> %val, ptr addrspace(1) %use
270+
ret void
271+
}
272+
241273
define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
242274
; GFX1250-SDAG-LABEL: ds_load_tr6_b96:
243275
; GFX1250-SDAG: ; %bb.0: ; %entry
@@ -263,6 +295,22 @@ entry:
263295
ret void
264296
}
265297

298+
define amdgpu_ps void @ds_load_tr6_b96_saddr(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
299+
; GFX1250-LABEL: ds_load_tr6_b96_saddr:
300+
; GFX1250: ; %bb.0: ; %entry
301+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
302+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
303+
; GFX1250-NEXT: ds_load_tr6_b96 v[2:4], v2 offset:32
304+
; GFX1250-NEXT: s_wait_dscnt 0x0
305+
; GFX1250-NEXT: global_store_b96 v[0:1], v[2:4], off
306+
; GFX1250-NEXT: s_endpgm
307+
entry:
308+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
309+
%val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
310+
store <3 x i32> %val, ptr addrspace(1) %use
311+
ret void
312+
}
313+
266314
define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
267315
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16:
268316
; GFX1250-SDAG: ; %bb.0: ; %entry
@@ -288,6 +336,22 @@ entry:
288336
ret void
289337
}
290338

339+
define amdgpu_ps void @ds_load_tr16_b128_v8i16_saddr(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
340+
; GFX1250-LABEL: ds_load_tr16_b128_v8i16_saddr:
341+
; GFX1250: ; %bb.0: ; %entry
342+
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
343+
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
344+
; GFX1250-NEXT: ds_load_tr16_b128 v[2:5], v2 offset:32
345+
; GFX1250-NEXT: s_wait_dscnt 0x0
346+
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
347+
; GFX1250-NEXT: s_endpgm
348+
entry:
349+
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
350+
%val = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3) %gep)
351+
store <8 x i16> %val, ptr addrspace(1) %use
352+
ret void
353+
}
354+
291355
define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
292356
; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16:
293357
; GFX1250-SDAG: ; %bb.0: ; %entry

0 commit comments

Comments
 (0)