[RISCV] Functional llvm.vector.reduce.mul on scalable types#193094
[RISCV] Functional llvm.vector.reduce.mul on scalable types#193094
Conversation
RVV does not have an instruction for performing a horizontal multiply reduction (either integer or floating point). However, a user of clang can explicitly write at least the integer form via the __builtin_reduce_mul construct, and currently we just crash when compiling this. This change converts the crash into a functionally correct scalar loop that processes each element one by one at runtime. This will be slow, but at least correct. Note that to my knowledge we can't generate the floating point one directly from C, but I decided to handle both for completeness while I was here.
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v Author: Philip Reames (preames) Changes: RVV does not have an instruction for performing a horizontal multiply reduction (either integer or floating point). However, a user of clang can explicitly write at least the integer form via the __builtin_reduce_mul construct, and currently we just crash when compiling this. This change converts the crash into a functionally correct scalar loop that processes each element one by one at runtime. This will be slow, but at least correct. Note that to my knowledge we can't generate the floating point one directly from C, but I decided to handle both for completeness while I was here. Written by Claude Code with guidance and review by me. Patch is 27.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/193094.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index ccba9ee16885b..b8367ccadd9d9 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -491,6 +491,12 @@ LLVM_ABI Value *getOrderedReduction(IRBuilderBase &Builder, Value *Acc,
Value *Src, unsigned Op,
RecurKind MinMaxKind = RecurKind::None);
+/// Expand a scalable vector reduction into a runtime loop that applies
+/// \p RdxOpcode element by element, starting from \p Acc as the initial
+/// accumulator value (typically the reduction identity).
+LLVM_ABI Value *expandReductionViaLoop(IRBuilderBase &Builder, Value *Vec,
+ unsigned RdxOpcode, Value *Acc);
+
/// Generates a vector reduction using shufflevectors to reduce the value.
/// Fast-math-flags are propagated using the IRBuilder's setting.
LLVM_ABI Value *getShuffleReduction(IRBuilderBase &Builder, Value *Src,
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index f4a07e1988747..376807944a6d7 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -75,6 +75,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
Value *Acc = II->getArgOperand(0);
Value *Vec = II->getArgOperand(1);
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
+ if (isa<ScalableVectorType>(Vec->getType())) {
+ Rdx = expandReductionViaLoop(Builder, Vec, RdxOpcode, Acc);
+ break;
+ }
if (!FMF.allowReassoc())
Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
else {
@@ -125,10 +129,16 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
Value *Vec = II->getArgOperand(0);
+ unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
+ if (isa<ScalableVectorType>(Vec->getType())) {
+ Type *EltTy = Vec->getType()->getScalarType();
+ Value *Ident = getReductionIdentity(ID, EltTy, FMF);
+ Rdx = expandReductionViaLoop(Builder, Vec, RdxOpcode, Ident);
+ break;
+ }
if (!isPowerOf2_32(
cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
- unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
break;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 62c3b477bb9b8..1e317c8e2265a 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1325,6 +1325,48 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
return Result;
}
+Value *llvm::expandReductionViaLoop(IRBuilderBase &Builder, Value *Vec,
+ unsigned RdxOpcode, Value *Acc) {
+ auto *VTy = cast<VectorType>(Vec->getType());
+ Type *EltTy = VTy->getElementType();
+ Function *F = Builder.GetInsertBlock()->getParent();
+
+ const DataLayout &DL = F->getDataLayout();
+ Type *IdxTy = DL.getIndexType(EltTy->getContext(), 0);
+ unsigned MinElts = VTy->getElementCount().getKnownMinValue();
+ Value *NumElts = Builder.CreateVScale(IdxTy);
+ NumElts = Builder.CreateMul(NumElts, ConstantInt::get(IdxTy, MinElts));
+
+ BasicBlock *EntryBB = Builder.GetInsertBlock();
+ BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "rdx.loop", F);
+ BasicBlock *ExitBB =
+ EntryBB->splitBasicBlock(Builder.GetInsertPoint(), "rdx.exit");
+
+ EntryBB->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(EntryBB);
+ Builder.CreateBr(LoopBB);
+
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *IV = Builder.CreatePHI(IdxTy, 2, "rdx.iv");
+ PHINode *AccPhi = Builder.CreatePHI(EltTy, 2, "rdx.acc");
+ IV->addIncoming(ConstantInt::get(IdxTy, 0), EntryBB);
+ AccPhi->addIncoming(Acc, EntryBB);
+
+ Value *Elt = Builder.CreateExtractElement(Vec, IV);
+ Value *Res = Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode,
+ AccPhi, Elt, "rdx.op");
+
+ Value *NextIV = Builder.CreateNUWAdd(IV, ConstantInt::get(IdxTy, 1), "rdx.next");
+ IV->addIncoming(NextIV, LoopBB);
+ AccPhi->addIncoming(Res, LoopBB);
+
+ Value *Done = Builder.CreateICmpEQ(NextIV, NumElts, "rdx.done");
+ Builder.CreateCondBr(Done, ExitBB, LoopBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return Res;
+}
+
// Helper to generate a log2 shuffle reduction.
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
unsigned Op,
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 70150d59e729c..a46d039dcf250 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -953,3 +953,143 @@ define half @vreduce_fmax_nxv12f16(<vscale x 12 x half> %v) {
%red = call half @llvm.vector.reduce.fmax.nxv12f16(<vscale x 12 x half> %v)
ret half %red
}
+
+define float @vreduce_fmul_nxv1f32(<vscale x 1 x float> %v, float %s) {
+; CHECK-LABEL: vreduce_fmul_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: .LBB75_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.s fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB75_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call reassoc float @llvm.vector.reduce.fmul.nxv1f32(float %s, <vscale x 1 x float> %v)
+ ret float %red
+}
+
+define float @vreduce_ord_fmul_nxv1f32(<vscale x 1 x float> %v, float %s) {
+; CHECK-LABEL: vreduce_ord_fmul_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: .LBB76_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.s fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB76_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call float @llvm.vector.reduce.fmul.nxv1f32(float %s, <vscale x 1 x float> %v)
+ ret float %red
+}
+
+define float @vreduce_fmul_nxv2f32(<vscale x 2 x float> %v, float %s) {
+; CHECK-LABEL: vreduce_fmul_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: .LBB77_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.s fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB77_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call reassoc float @llvm.vector.reduce.fmul.nxv2f32(float %s, <vscale x 2 x float> %v)
+ ret float %red
+}
+
+define float @vreduce_ord_fmul_nxv2f32(<vscale x 2 x float> %v, float %s) {
+; CHECK-LABEL: vreduce_ord_fmul_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT: .LBB78_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.s fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB78_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call float @llvm.vector.reduce.fmul.nxv2f32(float %s, <vscale x 2 x float> %v)
+ ret float %red
+}
+
+define float @vreduce_fmul_nxv4f32(<vscale x 4 x float> %v, float %s) {
+; CHECK-LABEL: vreduce_fmul_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: .LBB79_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v10, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v10
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.s fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB79_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call reassoc float @llvm.vector.reduce.fmul.nxv4f32(float %s, <vscale x 4 x float> %v)
+ ret float %red
+}
+
+define double @vreduce_fmul_nxv1f64(<vscale x 1 x double> %v, double %s) {
+; CHECK-LABEL: vreduce_fmul_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: .LBB80_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.d fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB80_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call reassoc double @llvm.vector.reduce.fmul.nxv1f64(double %s, <vscale x 1 x double> %v)
+ ret double %red
+}
+
+define double @vreduce_ord_fmul_nxv1f64(<vscale x 1 x double> %v, double %s) {
+; CHECK-LABEL: vreduce_ord_fmul_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT: .LBB81_1: # %rdx.loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vslidedown.vx v9, v8, a0
+; CHECK-NEXT: vfmv.f.s fa5, v9
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: fmul.d fa0, fa0, fa5
+; CHECK-NEXT: bne a0, a1, .LBB81_1
+; CHECK-NEXT: # %bb.2: # %rdx.exit
+; CHECK-NEXT: ret
+ %red = call double @llvm.vector.reduce.fmul.nxv1f64(double %s, <vscale x 1 x double> %v)
+ ret double %red
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
index d575b6c69dc3b..267d9b557a785 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+m,+v \
; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+m,+v \
; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
define signext i8 @vreduce_add_nxv1i8(<vscale x 1 x i8> %v) {
@@ -1701,3 +1701,510 @@ define i64 @vreduce_xor_nxv4i64(<vscale x 4 x i64> %v) {
%red = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v)
ret i64 %red
}
+
+define signext i8 @vreduce_mul_nxv1i8(<vscale x 1 x i8> %v) {
+; RV32-LABEL: vreduce_mul_nxv1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: .LBB114_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB114_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 3
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: .LBB114_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB114_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
+ %red = call i8 @llvm.vector.reduce.mul.nxv1i8(<vscale x 1 x i8> %v)
+ ret i8 %red
+}
+
+define signext i8 @vreduce_mul_nxv2i8(<vscale x 2 x i8> %v) {
+; RV32-LABEL: vreduce_mul_nxv2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV32-NEXT: .LBB115_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB115_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64-NEXT: .LBB115_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB115_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
+ %red = call i8 @llvm.vector.reduce.mul.nxv2i8(<vscale x 2 x i8> %v)
+ ret i8 %red
+}
+
+define signext i8 @vreduce_mul_nxv4i8(<vscale x 4 x i8> %v) {
+; RV32-LABEL: vreduce_mul_nxv4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; RV32-NEXT: .LBB116_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB116_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 24
+; RV32-NEXT: srai a0, a0, 24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; RV64-NEXT: .LBB116_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB116_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 56
+; RV64-NEXT: srai a0, a0, 56
+; RV64-NEXT: ret
+ %red = call i8 @llvm.vector.reduce.mul.nxv4i8(<vscale x 4 x i8> %v)
+ ret i8 %red
+}
+
+define signext i16 @vreduce_mul_nxv1i16(<vscale x 1 x i16> %v) {
+; RV32-LABEL: vreduce_mul_nxv1i16:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: .LBB117_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB117_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv1i16:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 3
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: .LBB117_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB117_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
+ %red = call i16 @llvm.vector.reduce.mul.nxv1i16(<vscale x 1 x i16> %v)
+ ret i16 %red
+}
+
+define signext i16 @vreduce_mul_nxv2i16(<vscale x 2 x i16> %v) {
+; RV32-LABEL: vreduce_mul_nxv2i16:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: .LBB118_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB118_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv2i16:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: .LBB118_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB118_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
+ %red = call i16 @llvm.vector.reduce.mul.nxv2i16(<vscale x 2 x i16> %v)
+ ret i16 %red
+}
+
+define signext i16 @vreduce_mul_nxv4i16(<vscale x 4 x i16> %v) {
+; RV32-LABEL: vreduce_mul_nxv4i16:
+; RV32: # %bb.0:
+; RV32-NEXT: li a0, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: .LBB119_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a0
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: bne a0, a2, .LBB119_1
+; RV32-NEXT: # %bb.2: # %rdx.exit
+; RV32-NEXT: slli a0, a1, 16
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vreduce_mul_nxv4i16:
+; RV64: # %bb.0:
+; RV64-NEXT: li a0, 0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 1
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: .LBB119_1: # %rdx.loop
+; RV64-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64-NEXT: vslidedown.vx v9, v8, a0
+; RV64-NEXT: vmv.x.s a3, v9
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: mul a1, a1, a3
+; RV64-NEXT: bne a0, a2, .LBB119_1
+; RV64-NEXT: # %bb.2: # %rdx.exit
+; RV64-NEXT: slli a0, a1, 48
+; RV64-NEXT: srai a0, a0, 48
+; RV64-NEXT: ret
+ %red = call i16 @llvm.vector.reduce.mul.nxv4i16(<vscale x 4 x i16> %v)
+ ret i16 %red
+}
+
+define signext i32 @vreduce_mul_nxv1i32(<vscale x 1 x i32> %v) {
+; RV32-LABEL: vreduce_mul_nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: li a0, 1
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: .LBB120_1: # %rdx.loop
+; RV32-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32-NEXT: vslidedown.vx v9, v8, a1
+; RV32-NEXT: vmv.x.s a3, v9
+; RV32-NEXT: addi a1, a1, 1
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: bne a1, a2, .LBB120_1
+; RV32-NEX...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
It turns out we already support use of the __builtin_reduce_ family of builtins on the builtin RVV types, but we have no test coverage that demonstrates this. Note that __builtin_reduce_mul is a bit of a corner case: currently the clang part works just fine, but the lowering will crash since we don't have a vredprod-esque instruction. (See #193094 for the lowering fix.)
…FC] (#193082) It turns out we already support use of the __builtin_reduce_ family of builtins on the builtin RVV types, but we have no test coverage that demonstrates this. Note that __builtin_reduce_mul is a bit of a corner case: currently the clang part works just fine, but the lowering will crash since we don't have a vredprod-esque instruction. (See llvm/llvm-project#193094 for the lowering fix.)
…FC] (#193082) It turns out we already support use of the __builtin_reduce_ family of builtins on the builtin RVV types, but we have no test coverage that demonstrates this. Note that __builtin_reduce_mul is a bit of a corner case: currently the clang part works just fine, but the lowering will crash since we don't have a vredprod-esque instruction. (See llvm/llvm-project#193094 for the lowering fix.)
| @@ -75,6 +75,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { | |||
| Value *Acc = II->getArgOperand(0); | |||
| Value *Vec = II->getArgOperand(1); | |||
| unsigned RdxOpcode = getArithmeticReductionInstruction(ID); | |||
| if (isa<ScalableVectorType>(Vec->getType())) { | |||
There was a problem hiding this comment.
Should we check the ID? It seems we will also expand Intrinsic::vector_reduce_fadd? But it is weird that there is no test change?
| @@ -125,10 +129,16 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { | |||
| case Intrinsic::vector_reduce_umax: | |||
| case Intrinsic::vector_reduce_umin: { | |||
| Value *Vec = II->getArgOperand(0); | |||
| unsigned RdxOpcode = getArithmeticReductionInstruction(ID); | |||
| if (isa<ScalableVectorType>(Vec->getType())) { | |||
RVV does not have an instruction for performing a horizontal multiply reduction (either integer or floating point). However, a user of clang can explicitly write at least the integer form via the __builtin_reduce_mul construct, and currently we just crash when compiling this.
This change converts the crash into a functionally correct scalar loop that processes each element one by one at runtime. This will be slow, but at least correct.
Note that to my knowledge we can't generate the floating point one directly from C, but I decided to handle both for completeness while I was here.
Written by Claude Code with guidance and review by me.