Skip to content

Commit 1b61537

Browse files
authored
[AMDGPU][InstCombine] Fold unused m0 operand to poison for sendmsg intrinsics (#183755)
Fold the second operand (m0) of llvm.amdgcn.s.sendmsg and llvm.amdgcn.s.sendmsghalt to poison when the message type does not use m0. Only some message types (for example MSG_INTERRUPT, MSG_GS, MSG_GS_DONE and MSG_GS_ALLOC_REQ) actually read the m0 value. Message types that are known to ignore it can have the operand folded to poison, which eliminates unnecessary s_mov_b32 m0, 0 instructions in the generated code.

Fixes #183605

- Added InstCombine case for amdgcn_s_sendmsg and amdgcn_s_sendmsghalt intrinsics
- Decode the message ID with decodeMsg so that both the pre-GFX11 (4-bit) and GFX11+ (8-bit) encodings are handled
- Fold only for an explicit allowlist of message IDs (msgDoesNotUseM0); every other ID, including unknown ones, keeps its m0 operand
1 parent b751241 commit 1b61537

4 files changed

Lines changed: 297 additions & 0 deletions

File tree

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,29 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
14731473
// amdgcn.kill(i1 1) is a no-op
14741474
return IC.eraseInstFromFunction(II);
14751475
}
1476+
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
  // The second operand is copied to m0, but is only actually used for
  // certain message types. For message types that are known to not use m0,
  // fold it to poison.
  using namespace AMDGPU::SendMsg;

  // Nothing to do when the operand has already been folded.
  Value *M0Val = II.getArgOperand(1);
  if (isa<PoisonValue>(M0Val))
    break;

  // Operand 0 is declared immarg, so it is always a ConstantInt here.
  auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
  uint16_t MsgId, OpId, StreamId;
  // Only MsgId is consulted below; OpId and StreamId are required out-params.
  decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);

  // Keep m0 unless this ID is on the explicit "does not use m0" list.
  if (!msgDoesNotUseM0(MsgId, *ST))
    break;

  // Drop UB-implying attributes since we're replacing with poison.
  II.dropUBImplyingAttrsAndMetadata();
  // Return the modified call (replaceOperand yields the instruction) instead
  // of nullptr, so that InstCombine records that the IR was changed.
  return IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
}
14761499
case Intrinsic::amdgcn_update_dpp: {
14771500
Value *Old = II.getArgOperand(0);
14781501

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2481,6 +2481,33 @@ uint64_t encodeMsg(uint64_t MsgId, uint64_t OpId, uint64_t StreamId) {
24812481
return MsgId | (OpId << OP_SHIFT_) | (StreamId << STREAM_ID_SHIFT_);
24822482
}
24832483

2484+
bool msgDoesNotUseM0(int64_t MsgId, const MCSubtargetInfo &STI) {
2485+
// Explicitly list message types that are known to not use m0.
2486+
// This is safer than excluding only GS_ALLOC_REQ, in case new message
2487+
// types are added in the future that do use m0.
2488+
if (isGFX11Plus(STI)) {
2489+
switch (MsgId) {
2490+
case ID_DEALLOC_VGPRS_GFX11Plus:
2491+
return true;
2492+
default:
2493+
break;
2494+
}
2495+
}
2496+
switch (MsgId) {
2497+
case ID_SAVEWAVE:
2498+
case ID_STALL_WAVE_GEN:
2499+
case ID_HALT_WAVES:
2500+
case ID_ORDERED_PS_DONE:
2501+
case ID_EARLY_PRIM_DEALLOC:
2502+
case ID_GET_DOORBELL:
2503+
case ID_GET_DDID:
2504+
case ID_SYSMSG:
2505+
return true;
2506+
default:
2507+
return false;
2508+
}
2509+
}
2510+
24842511
} // namespace SendMsg
24852512

24862513
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1574,6 +1574,9 @@ void decodeMsg(unsigned Val, uint16_t &MsgId, uint16_t &OpId,
15741574
LLVM_READNONE
15751575
uint64_t encodeMsg(uint64_t MsgId, uint64_t OpId, uint64_t StreamId);
15761576

1577+
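/// Returns true if \p MsgId is a message type that is known not to use the
/// m0 operand on subtarget \p STI. IDs that are not explicitly listed
/// (including unknown ones) return false.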
1578+
bool msgDoesNotUseM0(int64_t MsgId, const MCSubtargetInfo &STI);
1579+
15771580
} // namespace SendMsg
15781581

15791582
unsigned getInitialPSInputAddr(const Function &F);
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt < %s -mtriple=amdgcn -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
3+
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,GFX9
4+
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,GFX10
5+
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,GFX11
6+
7+
; Test that the m0 operand is folded to poison for message types that don't use it.
8+
; For the default target (no mcpu), conflicting encodings (IDs 2, 3) should NOT be folded.
9+
10+
; MSG_INTERRUPT (1) DOES use m0 - should NOT be folded
11+
define void @test_sendmsg_interrupt(i32 %val) {
12+
; CHECK-LABEL: @test_sendmsg_interrupt(
13+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 [[VAL:%.*]])
14+
; CHECK-NEXT: ret void
15+
;
16+
call void @llvm.amdgcn.s.sendmsg(i32 1, i32 %val)
17+
ret void
18+
}
19+
20+
; MSG_GS (2) + GS_OP_EMIT (2 << 4) DOES use m0 (pre-GFX11) - should NOT be folded
21+
; On GFX11+ this is MSG_HS_TESSFACTOR which also uses m0 - should NOT be folded
22+
; On default target, this should NOT be folded (conflicting encoding)
23+
define void @test_sendmsg_gs_emit(i32 %val) {
24+
; DEFAULT-LABEL: @test_sendmsg_gs_emit(
25+
; DEFAULT-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[VAL:%.*]])
26+
; DEFAULT-NEXT: ret void
27+
;
28+
; GFX9-LABEL: @test_sendmsg_gs_emit(
29+
; GFX9-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[VAL:%.*]])
30+
; GFX9-NEXT: ret void
31+
;
32+
; GFX10-LABEL: @test_sendmsg_gs_emit(
33+
; GFX10-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[VAL:%.*]])
34+
; GFX10-NEXT: ret void
35+
;
36+
; GFX11-LABEL: @test_sendmsg_gs_emit(
37+
; GFX11-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 [[VAL:%.*]])
38+
; GFX11-NEXT: ret void
39+
;
40+
call void @llvm.amdgcn.s.sendmsg(i32 34, i32 %val)
41+
ret void
42+
}
43+
44+
; MSG_GS_DONE (3) DOES use m0 (pre-GFX11) - should NOT be folded
45+
; On GFX11+ this is ID_DEALLOC_VGPRS which doesn't use m0
46+
; On default target, this should NOT be folded (conflicting encoding)
47+
define void @test_sendmsg_gs_done(i32 %val) {
48+
; DEFAULT-LABEL: @test_sendmsg_gs_done(
49+
; DEFAULT-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
50+
; DEFAULT-NEXT: ret void
51+
;
52+
; GFX9-LABEL: @test_sendmsg_gs_done(
53+
; GFX9-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
54+
; GFX9-NEXT: ret void
55+
;
56+
; GFX10-LABEL: @test_sendmsg_gs_done(
57+
; GFX10-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
58+
; GFX10-NEXT: ret void
59+
;
60+
; GFX11-LABEL: @test_sendmsg_gs_done(
61+
; GFX11-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 poison)
62+
; GFX11-NEXT: ret void
63+
;
64+
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %val)
65+
ret void
66+
}
67+
68+
; MSG_SAVEWAVE (4) doesn't use m0 (GFX8-GFX10)
69+
define void @test_sendmsg_savewave(i32 %val) {
70+
; CHECK-LABEL: @test_sendmsg_savewave(
71+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 4, i32 poison)
72+
; CHECK-NEXT: ret void
73+
;
74+
call void @llvm.amdgcn.s.sendmsg(i32 4, i32 %val)
75+
ret void
76+
}
77+
78+
; MSG_STALL_WAVE_GEN (5) doesn't use m0 (GFX9-GFX11)
79+
define void @test_sendmsg_stall_wave_gen(i32 %val) {
80+
; CHECK-LABEL: @test_sendmsg_stall_wave_gen(
81+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 5, i32 poison)
82+
; CHECK-NEXT: ret void
83+
;
84+
call void @llvm.amdgcn.s.sendmsg(i32 5, i32 %val)
85+
ret void
86+
}
87+
88+
; MSG_HALT_WAVES (6) doesn't use m0 (GFX9-GFX11)
89+
define void @test_sendmsg_halt_waves(i32 %val) {
90+
; CHECK-LABEL: @test_sendmsg_halt_waves(
91+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 6, i32 poison)
92+
; CHECK-NEXT: ret void
93+
;
94+
call void @llvm.amdgcn.s.sendmsg(i32 6, i32 %val)
95+
ret void
96+
}
97+
98+
; MSG_ORDERED_PS_DONE (7) doesn't use m0 (GFX9-GFX10)
99+
define void @test_sendmsg_ordered_ps_done(i32 %val) {
100+
; CHECK-LABEL: @test_sendmsg_ordered_ps_done(
101+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 7, i32 poison)
102+
; CHECK-NEXT: ret void
103+
;
104+
call void @llvm.amdgcn.s.sendmsg(i32 7, i32 %val)
105+
ret void
106+
}
107+
108+
; MSG_EARLY_PRIM_DEALLOC (8) doesn't use m0 (GFX9 only)
109+
define void @test_sendmsg_early_prim_dealloc(i32 %val) {
110+
; CHECK-LABEL: @test_sendmsg_early_prim_dealloc(
111+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 8, i32 poison)
112+
; CHECK-NEXT: ret void
113+
;
114+
call void @llvm.amdgcn.s.sendmsg(i32 8, i32 %val)
115+
ret void
116+
}
117+
118+
; MSG_GS_ALLOC_REQ (9) DOES use m0 - should NOT be folded
119+
define void @test_sendmsg_gs_alloc_req(i32 %val) {
120+
; CHECK-LABEL: @test_sendmsg_gs_alloc_req(
121+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 9, i32 [[VAL:%.*]])
122+
; CHECK-NEXT: ret void
123+
;
124+
call void @llvm.amdgcn.s.sendmsg(i32 9, i32 %val)
125+
ret void
126+
}
127+
128+
; MSG_GET_DOORBELL (10) doesn't use m0 (GFX9-GFX10)
129+
define void @test_sendmsg_get_doorbell(i32 %val) {
130+
; CHECK-LABEL: @test_sendmsg_get_doorbell(
131+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 10, i32 poison)
132+
; CHECK-NEXT: ret void
133+
;
134+
call void @llvm.amdgcn.s.sendmsg(i32 10, i32 %val)
135+
ret void
136+
}
137+
138+
; MSG_GET_DDID (11) doesn't use m0 (GFX10 only)
139+
define void @test_sendmsg_get_ddid(i32 %val) {
140+
; CHECK-LABEL: @test_sendmsg_get_ddid(
141+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 11, i32 poison)
142+
; CHECK-NEXT: ret void
143+
;
144+
call void @llvm.amdgcn.s.sendmsg(i32 11, i32 %val)
145+
ret void
146+
}
147+
148+
; MSG_SYSMSG (15) doesn't use m0
149+
define void @test_sendmsg_sysmsg(i32 %val) {
150+
; CHECK-LABEL: @test_sendmsg_sysmsg(
151+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 15, i32 poison)
152+
; CHECK-NEXT: ret void
153+
;
154+
call void @llvm.amdgcn.s.sendmsg(i32 15, i32 %val)
155+
ret void
156+
}
157+
158+
; Test sendmsghalt as well - MSG_INTERRUPT (1) DOES use m0 - should NOT be folded
159+
define void @test_sendmsghalt_interrupt(i32 %val) {
160+
; CHECK-LABEL: @test_sendmsghalt_interrupt(
161+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 [[VAL:%.*]])
162+
; CHECK-NEXT: ret void
163+
;
164+
call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %val)
165+
ret void
166+
}
167+
168+
; Test sendmsghalt - MSG_GS_ALLOC_REQ (9) DOES use m0 - should NOT be folded
169+
define void @test_sendmsghalt_gs_alloc_req(i32 %val) {
170+
; CHECK-LABEL: @test_sendmsghalt_gs_alloc_req(
171+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsghalt(i32 9, i32 [[VAL:%.*]])
172+
; CHECK-NEXT: ret void
173+
;
174+
call void @llvm.amdgcn.s.sendmsghalt(i32 9, i32 %val)
175+
ret void
176+
}
177+
178+
; m0 already poison - should be a no-op
179+
define void @test_sendmsg_already_poison() {
180+
; CHECK-LABEL: @test_sendmsg_already_poison(
181+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 poison)
182+
; CHECK-NEXT: ret void
183+
;
184+
call void @llvm.amdgcn.s.sendmsg(i32 1, i32 poison)
185+
ret void
186+
}
187+
188+
; Test that noundef attribute is dropped when folding to poison
189+
; Using MSG_SAVEWAVE (4) which doesn't use m0
190+
define void @test_sendmsg_noundef(i32 noundef %val) {
191+
; CHECK-LABEL: @test_sendmsg_noundef(
192+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 4, i32 poison)
193+
; CHECK-NEXT: ret void
194+
;
195+
call void @llvm.amdgcn.s.sendmsg(i32 4, i32 noundef %val)
196+
ret void
197+
}
198+
199+
; Test unknown message ID - should NOT be folded (future-proofing)
200+
define void @test_sendmsg_unknown_id(i32 %val) {
201+
; CHECK-LABEL: @test_sendmsg_unknown_id(
202+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 14, i32 [[VAL:%.*]])
203+
; CHECK-NEXT: ret void
204+
;
205+
call void @llvm.amdgcn.s.sendmsg(i32 14, i32 %val)
206+
ret void
207+
}
208+
209+
; Test MSG_HS_TESSFACTOR (2) on GFX11+ - DOES use m0, should NOT be folded
210+
; On pre-GFX11, this is MSG_GS which also uses m0
211+
define void @test_sendmsg_hs_tessfactor(i32 %val) {
212+
; CHECK-LABEL: @test_sendmsg_hs_tessfactor(
213+
; CHECK-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 2, i32 [[VAL:%.*]])
214+
; CHECK-NEXT: ret void
215+
;
216+
call void @llvm.amdgcn.s.sendmsg(i32 2, i32 %val)
217+
ret void
218+
}
219+
220+
; Test MSG_DEALLOC_VGPRS (3) on GFX11+ - doesn't use m0
221+
; On default target, this should NOT be folded (conflicting encoding with MSG_GS_DONE)
222+
define void @test_sendmsg_dealloc_vgprs(i32 %val) {
223+
; DEFAULT-LABEL: @test_sendmsg_dealloc_vgprs(
224+
; DEFAULT-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
225+
; DEFAULT-NEXT: ret void
226+
;
227+
; GFX9-LABEL: @test_sendmsg_dealloc_vgprs(
228+
; GFX9-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
229+
; GFX9-NEXT: ret void
230+
;
231+
; GFX10-LABEL: @test_sendmsg_dealloc_vgprs(
232+
; GFX10-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 [[VAL:%.*]])
233+
; GFX10-NEXT: ret void
234+
;
235+
; GFX11-LABEL: @test_sendmsg_dealloc_vgprs(
236+
; GFX11-NEXT: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 poison)
237+
; GFX11-NEXT: ret void
238+
;
239+
call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %val)
240+
ret void
241+
}
242+
243+
declare void @llvm.amdgcn.s.sendmsg(i32 immarg, i32)
244+
declare void @llvm.amdgcn.s.sendmsghalt(i32 immarg, i32)

0 commit comments

Comments
 (0)