[llvm] [RISCV] Vectorize phi for loop carried @llvm.vp.reduce.* (PR #131974)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 18:52:16 PDT 2025
https://github.com/NexMing updated https://github.com/llvm/llvm-project/pull/131974
>From 2118b7894c6efa6da1a27f3a4fb5f8bb2d8feca5 Mon Sep 17 00:00:00 2001
From: yanming <ming.yan at terapines.com>
Date: Fri, 21 Mar 2025 15:56:03 +0800
Subject: [PATCH 1/3] add testcases
---
.../RISCV/rvv/riscv-codegenprepare-asm.ll | 493 ++++++++++++++++++
.../CodeGen/RISCV/rvv/riscv-codegenprepare.ll | 462 ++++++++++++++++
2 files changed, 955 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 3bbdd1a257fdb..699e6f34a211a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -42,3 +42,496 @@ vector.body:
exit:
ret float %acc
}
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: vp_reduce_add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB1_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredsum.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB1_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: vp_reduce_and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: lui a1, 524288
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB2_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredand.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB2_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_or(ptr %a) {
+; CHECK-LABEL: vp_reduce_or:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB3_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredor.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB3_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_xor(ptr %a) {
+; CHECK-LABEL: vp_reduce_xor:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB4_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredxor.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB4_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smax(ptr %a) {
+; CHECK-LABEL: vp_reduce_smax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: lui a1, 524288
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB5_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredmax.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB5_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smin(ptr %a) {
+; CHECK-LABEL: vp_reduce_smin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: lui a1, 524288
+; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: .LBB6_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredmin.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB6_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umax(ptr %a) {
+; CHECK-LABEL: vp_reduce_umax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB7_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredmaxu.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB7_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umin(ptr %a) {
+; CHECK-LABEL: vp_reduce_umin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: lui a1, 524288
+; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: .LBB8_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a4, a3
+; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
+; CHECK-NEXT: slli a6, a2, 2
+; CHECK-NEXT: add a6, a0, a6
+; CHECK-NEXT: vle32.v v8, (a6)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: sub a3, a3, a5
+; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
+; CHECK-NEXT: vredminu.vs v10, v8, v10
+; CHECK-NEXT: vmv.x.s a1, v10
+; CHECK-NEXT: add a2, a2, a5
+; CHECK-NEXT: bnez a3, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define float @vp_reduce_fadd(ptr %a) {
+; CHECK-LABEL: vp_reduce_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB9_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
+; CHECK-NEXT: slli a5, a1, 2
+; CHECK-NEXT: add a5, a0, a5
+; CHECK-NEXT: vle32.v v8, (a5)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
+; CHECK-NEXT: sub a2, a2, a4
+; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
+; CHECK-NEXT: vfredosum.vs v10, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v10
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a2, .LBB9_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmax(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmax:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB10_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
+; CHECK-NEXT: slli a5, a1, 2
+; CHECK-NEXT: add a5, a0, a5
+; CHECK-NEXT: vle32.v v8, (a5)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
+; CHECK-NEXT: sub a2, a2, a4
+; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
+; CHECK-NEXT: vfredmax.vs v10, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v10
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a2, .LBB10_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmin(ptr %a) {
+; CHECK-LABEL: vp_reduce_fmin:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB11_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: mv a3, a2
+; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
+; CHECK-NEXT: slli a5, a1, 2
+; CHECK-NEXT: add a5, a0, a5
+; CHECK-NEXT: vle32.v v8, (a5)
+; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; CHECK-NEXT: vfmv.s.f v10, fa0
+; CHECK-NEXT: sub a2, a2, a4
+; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
+; CHECK-NEXT: vfredmin.vs v10, v8, v10
+; CHECK-NEXT: vfmv.f.s fa0, v10
+; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: bnez a2, .LBB11_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
index 006fc269050b0..95346e75db632 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
@@ -44,3 +44,465 @@ vector.body:
exit:
ret float %acc
}
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_add(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_and(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_or(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_or(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_xor(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_xor(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smax(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_smax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_smin(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_smin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umax(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_umax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define i32 @vp_reduce_umin(ptr %a) {
+; CHECK-LABEL: define i32 @vp_reduce_umin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret i32 %red
+}
+
+define float @vp_reduce_fadd(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fadd(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fadd.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmax(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fmax(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fmax.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
+
+define float @vp_reduce_fmin(ptr %a) {
+; CHECK-LABEL: define float @vp_reduce_fmin(
+; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
+; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fmin.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
+; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
+; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret float [[RED]]
+;
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+ %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+ %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ]
+ %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+ %evl2 = zext i32 %evl to i64
+ %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind
+ %wide.load = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, <vscale x 4 x float> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+ %next.ind = add i64 %scalar.ind, %evl2
+ %m = icmp eq i64 %remaining.trip.count, 0
+ br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret float %red
+}
>From 5d0035dbfebe6e4e5b38d5e8a63dd218e27ba326 Mon Sep 17 00:00:00 2001
From: yanming <ming.yan at terapines.com>
Date: Fri, 21 Mar 2025 15:56:48 +0800
Subject: [PATCH 2/3] [RISCV] Vectorize phi for loop carried @llvm.vp.reduce.*
This patch is the vector-predication version of commit
15b0fabb21af8395c1b810e7d992a869b9ef31d8
---
llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 3 +-
.../RISCV/rvv/riscv-codegenprepare-asm.ll | 307 ++++++++----------
.../CodeGen/RISCV/rvv/riscv-codegenprepare.ll | 66 ++--
3 files changed, 181 insertions(+), 195 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 5be5345cca73a..39877fb511ec3 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -137,7 +137,8 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
if (expandVPStrideLoad(I))
return true;
- if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
+ if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
+ !isa<VPReductionIntrinsic>(&I))
return false;
auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 699e6f34a211a..4e5f6e0f65489 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -46,26 +46,22 @@ exit:
define i32 @vp_reduce_add(ptr %a) {
; CHECK-LABEL: vp_reduce_add:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB1_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredsum.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB1_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredsum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -91,26 +87,23 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_and(ptr %a) {
; CHECK-LABEL: vp_reduce_and:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB2_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredand.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB2_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredand.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -136,26 +129,22 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_or(ptr %a) {
; CHECK-LABEL: vp_reduce_or:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB3_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredor.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB3_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB3_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -181,26 +170,22 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_xor(ptr %a) {
; CHECK-LABEL: vp_reduce_xor:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB4_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredxor.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB4_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredxor.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB4_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -226,26 +211,23 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_smax(ptr %a) {
; CHECK-LABEL: vp_reduce_smax:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB5_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredmax.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB5_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB5_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -271,27 +253,24 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_smin(ptr %a) {
; CHECK-LABEL: vp_reduce_smin:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: li a3, 1024
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: addi a1, a1, -1
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB6_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredmin.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB6_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB6_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -317,26 +296,22 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_umax(ptr %a) {
; CHECK-LABEL: vp_reduce_umax:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB7_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredmaxu.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB7_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredmaxu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB7_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -362,26 +337,23 @@ for.cond.cleanup: ; preds = %vector.body
define i32 @vp_reduce_umin(ptr %a) {
; CHECK-LABEL: vp_reduce_umin:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: li a3, 1024
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: lui a2, 524288
+; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a2
+; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB8_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: vsetvli a5, a3, e8, mf2, ta, ma
-; CHECK-NEXT: slli a6, a2, 2
-; CHECK-NEXT: add a6, a0, a6
-; CHECK-NEXT: vle32.v v8, (a6)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: sub a3, a3, a5
-; CHECK-NEXT: vsetvli zero, a4, e32, m2, ta, ma
-; CHECK-NEXT: vredminu.vs v10, v8, v10
-; CHECK-NEXT: vmv.x.s a1, v10
-; CHECK-NEXT: add a2, a2, a5
-; CHECK-NEXT: bnez a3, .LBB8_1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredminu.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB8_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -408,24 +380,21 @@ define float @vp_reduce_fadd(ptr %a) {
; CHECK-LABEL: vp_reduce_fadd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB9_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
-; CHECK-NEXT: slli a5, a1, 2
-; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: vle32.v v8, (a5)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
-; CHECK-NEXT: sub a2, a2, a4
-; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT: vfredosum.vs v10, v8, v10
-; CHECK-NEXT: vfmv.f.s fa0, v10
-; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredosum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: bnez a2, .LBB9_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -452,24 +421,21 @@ define float @vp_reduce_fmax(ptr %a) {
; CHECK-LABEL: vp_reduce_fmax:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB10_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
-; CHECK-NEXT: slli a5, a1, 2
-; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: vle32.v v8, (a5)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
-; CHECK-NEXT: sub a2, a2, a4
-; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT: vfredmax.vs v10, v8, v10
-; CHECK-NEXT: vfmv.f.s fa0, v10
-; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmax.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: bnez a2, .LBB10_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
@@ -496,24 +462,21 @@ define float @vp_reduce_fmin(ptr %a) {
; CHECK-LABEL: vp_reduce_fmin:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: fmv.w.x fa0, zero
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
; CHECK-NEXT: li a2, 1024
; CHECK-NEXT: .LBB11_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: vsetvli a4, a2, e8, mf2, ta, ma
-; CHECK-NEXT: slli a5, a1, 2
-; CHECK-NEXT: add a5, a0, a5
-; CHECK-NEXT: vle32.v v8, (a5)
-; CHECK-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-NEXT: vfmv.s.f v10, fa0
-; CHECK-NEXT: sub a2, a2, a4
-; CHECK-NEXT: vsetvli zero, a3, e32, m2, ta, ma
-; CHECK-NEXT: vfredmin.vs v10, v8, v10
-; CHECK-NEXT: vfmv.f.s fa0, v10
-; CHECK-NEXT: add a1, a1, a4
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vfredmin.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: bnez a2, .LBB11_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
entry:
br label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
index 95346e75db632..8967fb8bf01ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll
@@ -53,15 +53,17 @@ define i32 @vp_reduce_add(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -95,15 +97,17 @@ define i32 @vp_reduce_and(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -137,15 +141,17 @@ define i32 @vp_reduce_or(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -179,15 +185,17 @@ define i32 @vp_reduce_xor(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -221,15 +229,17 @@ define i32 @vp_reduce_smax(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -263,15 +273,17 @@ define i32 @vp_reduce_smin(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 2147483647, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -305,15 +317,17 @@ define i32 @vp_reduce_umax(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -347,15 +361,17 @@ define i32 @vp_reduce_umin(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ -2147483648, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x i32> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 [[TMP1]], <vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x i32> poison, i32 [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 [[RED]]
@@ -389,15 +405,17 @@ define float @vp_reduce_fadd(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fadd.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fadd.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret float [[RED]]
@@ -431,15 +449,17 @@ define float @vp_reduce_fmax(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fmax.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmax.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret float [[RED]]
@@ -473,15 +493,17 @@ define float @vp_reduce_fmin(ptr %a) {
; CHECK: vector.body:
; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true)
; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT: [[RED]] = tail call float @llvm.vp.reduce.fmin.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <vscale x 4 x float> [[TMP0]], i64 0
+; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmin.nxv4f32(float [[TMP1]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]]
; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]]
; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0
+; CHECK-NEXT: [[TMP2]] = insertelement <vscale x 4 x float> poison, float [[RED]], i64 0
; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret float [[RED]]
>From 26da6aeb340b72e88e4386afebcd828ae710a22e Mon Sep 17 00:00:00 2001
From: yanming <ming.yan at terapines.com>
Date: Wed, 26 Mar 2025 09:49:03 +0800
Subject: [PATCH 3/3] Update the comment
---
llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 39877fb511ec3..b5cb05f30fb26 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -113,9 +113,10 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
// vfredosum.vs v8, v8, v10
// vfmv.f.s fa0, v8
//
-// This mainly affects ordered fadd reductions, since other types of reduction
-// typically use element-wise vectorisation in the loop body. This tries to
-// vectorize any scalar phis that feed into a fadd reduction:
+// This mainly affects ordered fadd reductions and VP reductions that have a
+// scalar start value, since other types of reduction typically use element-wise
+// vectorisation in the loop body. This tries to vectorize any scalar phis that
+// feed into these reductions:
//
// loop:
// %phi = phi <float> [ ..., %entry ], [ %acc, %loop ]
More information about the llvm-commits
mailing list