diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 7e75ec0994014..7ba9231e38ccd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1587,6 +1587,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}}) .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}}); + addRulesForIOpcs({amdgcn_ballot}, Standard) + .Uni(S64, {{Sgpr64}, {IntrId, Vcc}}) + .Uni(S32, {{Sgpr32}, {IntrId, Vcc}}); + addRulesForIOpcs({amdgcn_exp}) .Any({{_, _, _, S32, S32, S32, S32}, {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index d3e211855d7ed..ad88b6030004b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX10 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -global-isel < %s | FileCheck -check-prefixes=CHECK,GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -21,7 +21,7 @@ define amdgpu_cs i32 @constant_false() { define amdgpu_cs i32 @constant_true() { ; CHECK-LABEL: constant_true: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_and_b32 s0, exec_lo, exec_lo ; CHECK-NEXT: ; return to shader part epilog %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) ret i32 %ballot @@ -33,8 +33,7 @@ define amdgpu_cs i32 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) @@ -73,33 +72,28 @@ define amdgpu_cs i32 @compare_floats(float %x, float %y) { ret i32 %ballot } -define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) { -; CHECK-LABEL: ctpop_of_ballot: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 -; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo -; CHECK-NEXT: ; return to shader part epilog - %cmp = fcmp ogt float %x, %y - %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) - %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot) - ret i32 %bcnt -} +; FIXME: Re-enable once G_CTPOP has RegBankLegalize rules. +; define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) { +; %cmp = fcmp ogt float %x, %y +; %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) +; %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot) +; ret i32 %bcnt +; } define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 +; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_branch .LBB6_3 +; CHECK-NEXT: .LBB6_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_branch .LBB6_3 +; CHECK-NEXT: .LBB6_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -116,14 +110,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB8_3 -; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB8_3 -; CHECK-NEXT: .LBB8_3: +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -139,15 +133,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo -; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 +; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -165,14 +159,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_3: +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: %c = trunc i32 %v to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -188,14 +182,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -210,14 +204,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_ge_u32 s0, 12 -; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB12_3 -; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB12_3 -; CHECK-NEXT: .LBB12_3: +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_ne_zero = icmp ne i32 %ballot, 0 @@ -233,14 +227,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB13_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -255,14 +249,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB13_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_3: +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %ballot_eq_zero = icmp eq i32 %ballot, 0 @@ -280,14 +274,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 ; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -309,14 +303,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_or_b32 s0, s0, s1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB16_3 -; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB16_3 -; CHECK-NEXT: .LBB16_3: +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -335,14 +329,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 ; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0 -; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -364,14 +358,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_3: +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -388,18 +382,17 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; CHECK-NEXT: s_cselect_b32 s0, exec_lo, 0 +; CHECK-NEXT: s_and_b32 s0, s0, exec_lo ; CHECK-NEXT: s_cmp_le_i32 s0, 22 -; CHECK-NEXT: s_cbranch_scc1 .LBB19_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB19_3 -; CHECK-NEXT: .LBB19_2: ; %false +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB19_3 -; CHECK-NEXT: .LBB19_3: +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: %c = icmp ult i32 %v, 12 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %c) %bc = icmp sgt i32 %ballot, 22 @@ -415,9 +408,10 @@ false: define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; GFX10-LABEL: non_cst_non_compare_input: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, s0, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0 ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-NEXT: ; %bb.1: ; %B @@ -442,9 +436,10 @@ define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid ; ; GFX11-LABEL: non_cst_non_compare_input: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, s0, 1 ; GFX11-NEXT: s_mov_b32 s1, exec_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0 ; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 ; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX11-NEXT: ; %bb.1: ; %B diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index 250fbc7c0f147..9df5a811b85e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -new-reg-bank-select < %s | FileCheck %s declare i64 @llvm.amdgcn.ballot.i64(i1) declare i64 @llvm.ctpop.i64(i64) @@ -9,8 +9,7 @@ declare i64 @llvm.ctpop.i64(i64) define amdgpu_cs i64 @constant_false() { ; CHECK-LABEL: constant_false: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_and_b64 s[0:1], 0, exec ; CHECK-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0) ret i64 %ballot @@ -21,8 +20,7 @@ define amdgpu_cs i64 @constant_false() { define amdgpu_cs i64 @constant_true() { ; CHECK-LABEL: constant_true: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s0, exec_lo -; CHECK-NEXT: s_mov_b32 s1, exec_hi +; CHECK-NEXT: s_and_b64 s[0:1], exec, exec ; CHECK-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) ret i64 %ballot @@ -34,8 +32,7 @@ define amdgpu_cs i64 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) @@ -75,34 +72,28 @@ define amdgpu_cs i64 @compare_floats(float %x, float %y) { ret i64 %ballot } -define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { -; CHECK-LABEL: ctpop_of_ballot: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 -; CHECK-NEXT: s_bcnt1_i32_b64 s0, vcc -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: ; return to shader part epilog - %cmp = fcmp ogt float %x, %y - %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) - %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) - ret i64 %bcnt -} +; FIXME: Re-enable once G_CTPOP has RegBankLegalize rules. +; define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) { +; %cmp = fcmp ogt float %x, %y +; %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) +; %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot) +; ret i64 %bcnt +; } define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK-LABEL: branch_divergent_ballot_ne_zero_non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 +; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB6_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_2: ; %false +; CHECK-NEXT: s_branch .LBB6_3 +; CHECK-NEXT: .LBB6_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB7_3 -; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: s_branch .LBB6_3 +; CHECK-NEXT: .LBB6_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -119,14 +110,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) { ; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB8_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB8_3 -; CHECK-NEXT: .LBB8_2: ; %false +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB8_3 -; CHECK-NEXT: .LBB8_3: +; CHECK-NEXT: s_branch .LBB7_3 +; CHECK-NEXT: .LBB7_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -142,15 +133,15 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec -; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 +; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_cbranch_scc0 .LBB8_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_2: ; %true +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB9_3 -; CHECK-NEXT: .LBB9_3: +; CHECK-NEXT: s_branch .LBB8_3 +; CHECK-NEXT: .LBB8_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -168,14 +159,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) { ; CHECK-NEXT: s_xor_b32 s0, s0, 1 ; CHECK-NEXT: s_and_b32 s0, s0, 1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_2: ; %false +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB10_3 -; CHECK-NEXT: .LBB10_3: +; CHECK-NEXT: s_branch .LBB9_3 +; CHECK-NEXT: .LBB9_3: %c = trunc i32 %v to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -191,14 +182,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB10_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_2: ; %false +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB11_3 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: s_branch .LBB10_3 +; CHECK-NEXT: .LBB10_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -213,14 +204,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_ne_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_ge_u32 s0, 12 -; CHECK-NEXT: s_cbranch_scc1 .LBB12_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB11_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB12_3 -; CHECK-NEXT: .LBB12_2: ; %false +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB12_3 -; CHECK-NEXT: .LBB12_3: +; CHECK-NEXT: s_branch .LBB11_3 +; CHECK-NEXT: .LBB11_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_ne_zero = icmp ne i64 %ballot, 0 @@ -236,14 +227,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB13_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB12_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_2: ; %true +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB13_3 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: s_branch .LBB12_3 +; CHECK-NEXT: .LBB12_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -258,14 +249,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_eq_zero_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB13_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_2: ; %false +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB14_3 -; CHECK-NEXT: .LBB14_3: +; CHECK-NEXT: s_branch .LBB13_3 +; CHECK-NEXT: .LBB13_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %ballot_eq_zero = icmp eq i64 %ballot, 0 @@ -283,14 +274,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB14_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_2: ; %false +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB15_3 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: s_branch .LBB14_3 +; CHECK-NEXT: .LBB14_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -312,14 +303,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_or_b32 s0, s0, s1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB16_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB15_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB16_3 -; CHECK-NEXT: .LBB16_2: ; %false +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB16_3 -; CHECK-NEXT: .LBB16_3: +; CHECK-NEXT: s_branch .LBB15_3 +; CHECK-NEXT: .LBB15_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -338,14 +329,14 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) { ; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0 ; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1 ; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; CHECK-NEXT: s_cbranch_scc0 .LBB17_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB16_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_2: ; %true +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_2: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB17_3 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: s_branch .LBB16_3 +; CHECK-NEXT: .LBB16_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -367,14 +358,14 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg ; CHECK-NEXT: s_cselect_b32 s1, 1, 0 ; CHECK-NEXT: s_and_b32 s0, s0, s1 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 +; CHECK-NEXT: s_cbranch_scc1 .LBB17_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_2: ; %false +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB18_3 -; CHECK-NEXT: .LBB18_3: +; CHECK-NEXT: s_branch .LBB17_3 +; CHECK-NEXT: .LBB17_3: %v1c = icmp ult i32 %v1, 12 %v2c = icmp ugt i32 %v2, 34 %c = and i1 %v1c, %v2c @@ -391,18 +382,18 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) { ; CHECK-LABEL: branch_uniform_ballot_sgt_N_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_cmp_lt_u32 s0, 12 -; CHECK-NEXT: s_cselect_b32 s0, 1, 0 -; CHECK-NEXT: s_and_b32 s0, 1, s0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 -; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22 -; CHECK-NEXT: s_cbranch_vccnz .LBB19_2 +; CHECK-NEXT: s_cselect_b64 s[0:1], exec, 0 +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: v_cmp_le_i64_e64 s[0:1], s[0:1], 22 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB18_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 -; CHECK-NEXT: s_branch .LBB19_3 -; CHECK-NEXT: .LBB19_2: ; %false +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_2: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 -; CHECK-NEXT: s_branch .LBB19_3 -; CHECK-NEXT: .LBB19_3: +; CHECK-NEXT: s_branch .LBB18_3 +; CHECK-NEXT: .LBB18_3: %c = icmp ult i32 %v, 12 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %c) %bc = icmp sgt i64 %ballot, 22 @@ -418,9 +409,10 @@ false: define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { ; CHECK-LABEL: non_cst_non_compare_input: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: s_and_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_cselect_b64 s[0:1], exec, 0 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; CHECK-NEXT: ; %bb.1: ; %B diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ballot.i64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ballot.i64.mir index aa54b425a4db0..abb74f685e6d7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ballot.i64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.ballot.i64.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s --- name: ballot_sgpr_src @@ -13,10 +12,9 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[COPY1]](s1) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[INT]](s64) + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[COPY]](s32) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[AMDGPU_COPY_VCC_SCC]](s1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[INTRINSIC_CONVERGENT]](s64) %0:_(s32) = COPY $sgpr0 %1:_(s1) = G_TRUNC %0 %2:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), %1 @@ -34,10 +32,12 @@ body: | ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1) - ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[COPY1]](s1) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[INT]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[INTRINSIC_CONVERGENT]](s64) %0:_(s32) = COPY $vgpr0 %1:_(s1) = G_TRUNC %0 %2:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), %1 @@ -57,8 +57,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] - ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) - ; CHECK-NEXT: S_ENDPGM 0, implicit [[INT]](s64) + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.ballot), [[ICMP]](s1) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[INTRINSIC_CONVERGENT]](s64) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s1) = G_ICMP intpred(eq), %0, %1