[llvm] 5c65a32 - [RISCV] Vectorize phi for loop carried @llvm.vp.reduce.* (#131974)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 01:13:50 PDT 2025
Author: MingYan
Date: 2025-03-31T09:13:46+01:00
New Revision: 5c65a321778b99f745d193629975fb6ced34fe07
URL: https://github.com/llvm/llvm-project/commit/5c65a321778b99f745d193629975fb6ced34fe07
DIFF: https://github.com/llvm/llvm-project/commit/5c65a321778b99f745d193629975fb6ced34fe07.diff
LOG: [RISCV] Vectorize phi for loop carried @llvm.vp.reduce.* (#131974)
LLVM vector predication reduction intrinsics return a scalar result, but
on RISC-V, vector reduction instructions write the result in the first
element of a vector register. So when a reduction in a loop uses a
scalar phi, we end up with unnecessary scalar moves:
```asm
loop:
vmv.s.x v8, zero
vredsum.vs v8, v10, v8
vmv.x.s a0, v8
```
This mainly affects vector predication reduction. This tries to
vectorize any scalar phis that feed into a vector predication reduction
in RISCVCodeGenPrepare, converting:
```llvm
vector.body:
%red.phi = phi i32 [ ..., %entry ], [ %red, %vector.body ]
%red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
```
to
```llvm
vector.body:
%red.phi = phi <vscale x 4 x i32> [ ..., %entry ], [ %acc.vec, %vector.body ]
%phi.scalar = extractelement <vscale x 4 x i32> %red.phi, i64 0
%acc = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %phi.scalar, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
%acc.vec = insertelement <vscale x 4 x i32> poison, i32 %acc, i64 0
```
Which eliminates the scalar -> vector -> scalar crossing during
instruction selection.
---------
Co-authored-by: yanming <ming.yan at terapines.com>
Added:
Modified:
llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 5be5345cca73a..b5cb05f30fb26 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -113,9 +113,10 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
// vfredosum.vs v8, v8, v10
// vfmv.f.s fa0, v8
//
-// This mainly affects ordered fadd reductions, since other types of reduction
-// typically use element-wise vectorisation in the loop body. This tries to
-// vectorize any scalar phis that feed into a fadd reduction:
+// This mainly affects ordered fadd reductions and VP reductions that have a
+// scalar start value, since other types of reduction typically use element-wise
+// vectorisation in the loop body. This tries to vectorize any scalar phis that
+// feed into these reductions:
//
// loop:
// %phi = phi <float> [ ..., %entry ], [ %acc, %loop ]
@@ -137,7 +138,8 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
if (expandVPStrideLoad(I))
return true;
- if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
+ if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
+ !isa<VPReductionIntrinsic>(&I))
return false;
auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 3bbdd1a257fdb..4e5f6e0f65489 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -42,3 +42,459 @@ vector.body:
exit:
ret float %acc
}
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: vp_reduce_add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB1_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredsum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB1_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: vp_reduce_and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB2_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredand.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB2_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_or(ptr %a) {
+; CHECK-LABEL: vp_reduce_or:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB3_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB3_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_xor(ptr %a) {
+; CHECK-LABEL: vp_reduce_xor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB4_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredxor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smax(ptr %a) {
+; CHECK-LABEL: vp_reduce_smax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB5_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smin(ptr %a) {
+; CHECK-LABEL: vp_reduce_smin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB6_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umax(ptr %a) {
+; CHECK-LABEL: vp_reduce_umax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB7_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmaxu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB7_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umin(ptr %a) {
+; CHECK-LABEL: vp_reduce_umin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB8_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredminu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define float @vp_reduce_fadd(ptr %a) {
+; CHECK-LABEL: vp_reduce_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB9_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredosum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB9_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmax(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB10_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB10_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmin(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB11_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB11_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
index 006fc269050b0..8967fb8bf01ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
@@ -44,3 +44,487 @@ vector.body:
exit:
ret float %acc
}
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_add(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_and(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_or(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_or(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Loop-carried xor accumulator: RISCVCodeGenPrepare should turn the scalar i32
+; phi %red.phi into a <vscale x 4 x i32> phi (extractelement feeding the
+; vp.reduce, insertelement of the result), eliminating scalar<->vector moves.
+define i32 @vp_reduce_xor(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_xor(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Loop-carried smax accumulator seeded with INT32_MIN (the smax identity).
+; The scalar i32 phi should be vectorized into a <vscale x 4 x i32> phi with
+; the start value folded into a constant insertelement in the new phi.
+define i32 @vp_reduce_smax(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_smax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Loop-carried smin accumulator seeded with INT32_MAX (the smin identity).
+; Checks the scalar phi is rewritten as a <vscale x 4 x i32> phi with
+; extract/insert at lane 0 around the vp.reduce call.
+define i32 @vp_reduce_smin(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_smin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Loop-carried umax accumulator seeded with 0 (the umax identity). The scalar
+; phi should be vectorized into a <vscale x 4 x i32> phi like the other
+; integer reductions above.
+define i32 @vp_reduce_umax(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_umax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Loop-carried umin accumulator. NOTE(review): the start value -2147483648
+; (0x80000000) is not the umin identity (that would be all-ones); it appears
+; to be an arbitrary seed, which is fine for a codegen-prepare test since the
+; transform does not depend on the start value.
+define i32 @vp_reduce_umin(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_umin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+; Floating-point case: the loop-carried float fadd accumulator phi should be
+; vectorized into a <vscale x 4 x float> phi, same extract/insert-at-lane-0
+; pattern as the integer reductions.
+define float @vp_reduce_fadd(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fadd(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fadd.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+; Loop-carried float fmax accumulator (seeded with 0.0); should be vectorized
+; into a <vscale x 4 x float> phi like vp_reduce_fadd above.
+define float @vp_reduce_fmax(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fmax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmax.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+; Loop-carried float fmin accumulator (seeded with 0.0); should be vectorized
+; into a <vscale x 4 x float> phi like the other FP reductions above.
+define float @vp_reduce_fmin(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fmin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmin.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
More information about the llvm-commits
mailing list