[llvm] [RISCV] Widen i1 AnyOf reductions (PR #134898)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 21 03:02:31 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/134898
>From f590430cc01775983249c32c5ab08abe40de7ef8 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 8 Apr 2025 15:58:43 +0100
Subject: [PATCH 1/3] Precommit tests
---
.../CodeGen/RISCV/riscv-codegenprepare-asm.ll | 102 +++++++++++++++++-
.../CodeGen/RISCV/riscv-codegenprepare.ll | 90 ++++++++++++++++
2 files changed, 191 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 32261ee47164e..d3db332e1dd51 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv64 | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
; Make sure we don't emit a pair of shift for the zext in the preheader. We
@@ -127,3 +127,103 @@ for.body: ; preds = %for.body, %for.body
%niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
+
+define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
+; CHECK-LABEL: widen_anyof_rdx:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmclr.m v12
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: .LBB2_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: slli a4, a2, 2
+; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v14, (a4)
+; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v13, v14, 0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmsltu.vx v14, v8, a3
+; CHECK-NEXT: vmand.mm v13, v13, v14
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: vmor.mm v12, v12, v13
+; CHECK-NEXT: blt a2, a1, .LBB2_1
+; CHECK-NEXT: # %bb.2: # %exit
+; CHECK-NEXT: vcpop.m a0, v12
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+ %avl = sub i64 %n, %iv
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+ %gep = getelementptr i32, ptr %p, i64 %iv
+ %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+ %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+ %evl.zext = zext i32 %evl to i64
+ %iv.next = add i64 %iv, %evl.zext
+ %done = icmp sge i64 %iv.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+ ret i1 %res
+}
+
+
+define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: widen_anyof_rdx_use_in_loop:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a2, 0
+; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmclr.m v12
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: .LBB3_1: # %loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub a3, a1, a2
+; CHECK-NEXT: slli a4, a2, 2
+; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v14, (a4)
+; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v13, v14, 0
+; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT: vmsltu.vx v14, v8, a3
+; CHECK-NEXT: vmand.mm v13, v13, v14
+; CHECK-NEXT: vmor.mm v12, v12, v13
+; CHECK-NEXT: add a2, a2, a3
+; CHECK-NEXT: vsm.v v12, (a4)
+; CHECK-NEXT: blt a2, a1, .LBB3_1
+; CHECK-NEXT: # %bb.2: # %exit
+; CHECK-NEXT: vcpop.m a0, v12
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: ret
+entry:
+ br label %loop
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+ %avl = sub i64 %n, %iv
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+ %gep = getelementptr i32, ptr %p, i64 %iv
+ %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+ %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+ store <vscale x 4 x i1> %rec, ptr %gep
+
+ %evl.zext = zext i32 %evl to i64
+ %iv.next = add i64 %iv, %evl.zext
+ %done = icmp sge i64 %iv.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+ ret i1 %res
+}
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
index 2179a0d26cf98..3555309695f26 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
@@ -103,3 +103,93 @@ define i64 @bug(i32 %x) {
%b = and i64 %a, 4294967295
ret i64 %b
}
+
+define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
+; CHECK-LABEL: @widen_anyof_rdx(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT: [[TMP4]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP4]])
+; CHECK-NEXT: ret i1 [[RES]]
+;
+entry:
+ br label %loop
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+ %avl = sub i64 %n, %iv
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+ %gep = getelementptr i32, ptr %p, i64 %iv
+ %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+ %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+ %evl.zext = zext i32 %evl to i64
+ %iv.next = add i64 %iv, %evl.zext
+ %done = icmp sge i64 %iv.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+ ret i1 %res
+}
+
+
+define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: @widen_anyof_rdx_use_in_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[REC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT: [[REC]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
+; CHECK-NEXT: store <vscale x 4 x i1> [[REC]], ptr [[GEP]], align 1
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[REC]])
+; CHECK-NEXT: ret i1 [[RES]]
+;
+entry:
+ br label %loop
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+ %avl = sub i64 %n, %iv
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true)
+
+ %gep = getelementptr i32, ptr %p, i64 %iv
+ %x = call <vscale x 4 x i32> @llvm.vp.load(ptr %gep, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+ %cmp = icmp ne <vscale x 4 x i32> %x, zeroinitializer
+ %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
+
+ store <vscale x 4 x i1> %rec, ptr %gep
+
+ %evl.zext = zext i32 %evl to i64
+ %iv.next = add i64 %iv, %evl.zext
+ %done = icmp sge i64 %iv.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+ ret i1 %res
+}
>From 7b4100dc226cb12856c0c1749786fd113c593873 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 8 Apr 2025 18:56:50 +0100
Subject: [PATCH 2/3] [RISCV] Widen i1 AnyOf reductions
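With EVL tail folding an AnyOf reduction emits an i1 vp.merge, which RVV
has no tail undisturbed mask instructions for. A minimal sketch of the
rewrite this patch performs (value names are illustrative, not taken from
the diff below):

  ; before: the i1 merge needs a convoluted mask-instruction sequence
  %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)

  ; after: widened to i8 so it can lower to a tail undisturbed vmerge.vim,
  ; then truncated back to i1 for the reduce.or in the exit block
  %rec.w = call <vscale x 4 x i8> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i8> splat (i8 1), <vscale x 4 x i8> %phi.w, i32 %evl)
  %trunc = trunc <vscale x 4 x i8> %rec.w to <vscale x 4 x i1>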
---
llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 75 +++++++++++++++++++
.../CodeGen/RISCV/riscv-codegenprepare-asm.ll | 48 ++++++------
.../CodeGen/RISCV/riscv-codegenprepare.ll | 10 ++-
3 files changed, 104 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index b5cb05f30fb26..d034d2c7270f8 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -58,6 +59,7 @@ class RISCVCodeGenPrepare : public FunctionPass,
bool visitAnd(BinaryOperator &BO);
bool visitIntrinsicInst(IntrinsicInst &I);
bool expandVPStrideLoad(IntrinsicInst &I);
+ bool widenVPMerge(IntrinsicInst &I);
};
} // end anonymous namespace
@@ -103,6 +105,76 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
return true;
}
+// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge as
+// follows:
+//
+// loop:
+// %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
+// %cmp = icmp ...
+// %rec = call <vscale x 4 x i1> @llvm.vp.merge(%cmp, i1 true, %phi, %evl)
+// ...
+// middle:
+// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
+//
+// However, RVV doesn't have any tail undisturbed mask instructions, and so we
+// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see
+// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
+//
+// To avoid that, this widens the i1 vp.merge to an i8 vp.merge, which will
+// usually be folded into a masked vor.vv.
+//
+// loop:
+// %phi = phi <vscale x 4 x i8> [ zeroinitializer, %entry ], [ %rec, %loop ]
+// %cmp = icmp ...
+// %rec = call <vscale x 4 x i8> @llvm.vp.merge(%cmp, i8 true, %phi, %evl)
+// %trunc = trunc <vscale x 4 x i8> %rec to <vscale x 4 x i1>
+// ...
+// middle:
+// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
+//
+// The trunc will normally be sunk outside of the loop, but even if there are
+// users inside the loop, it is still profitable.
+bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) {
+ if (!II.getType()->getScalarType()->isIntegerTy(1))
+ return false;
+
+ Value *Mask, *True, *PhiV, *EVL;
+ using namespace PatternMatch;
+ if (!match(&II,
+ m_Intrinsic<Intrinsic::vp_merge>(m_Value(Mask), m_Value(True),
+ m_Value(PhiV), m_Value(EVL))))
+ return false;
+
+ auto *Phi = dyn_cast<PHINode>(PhiV);
+ if (!Phi || Phi->getNumUses() > 2 || Phi->getNumIncomingValues() != 2 ||
+ !match(Phi->getIncomingValue(0), m_Zero()) ||
+ Phi->getIncomingValue(1) != &II)
+ return false;
+
+ Type *WideTy =
+ VectorType::get(IntegerType::getInt8Ty(II.getContext()),
+ cast<VectorType>(II.getType())->getElementCount());
+
+ IRBuilder<> Builder(Phi);
+ PHINode *WidePhi = Builder.CreatePHI(WideTy, 2);
+ WidePhi->addIncoming(ConstantAggregateZero::get(WideTy),
+ Phi->getIncomingBlock(0));
+ Builder.SetInsertPoint(&II);
+ Value *WideTrue = Builder.CreateZExt(True, WideTy);
+ Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy},
+ {Mask, WideTrue, WidePhi, EVL});
+ WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1));
+ Value *Trunc = Builder.CreateTrunc(WideMerge, II.getType());
+
+ II.replaceAllUsesWith(Trunc);
+
+ // Break the cycle and delete the old chain.
+ Phi->setIncomingValue(1, Phi->getIncomingValue(0));
+ llvm::RecursivelyDeleteTriviallyDeadInstructions(&II);
+
+ return true;
+}
+
// LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
// reduction instructions write the result in the first element of a vector
// register. So when a reduction in a loop uses a scalar phi, we end up with
@@ -138,6 +210,9 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
if (expandVPStrideLoad(I))
return true;
+ if (widenVPMerge(I))
+ return true;
+
if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
!isa<VPReductionIntrinsic>(&I))
return false;
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index d3db332e1dd51..6136c321c08ca 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -132,26 +132,25 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
; CHECK-LABEL: widen_anyof_rdx:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmclr.m v12
-; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: .LBB2_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub a3, a1, a2
; CHECK-NEXT: slli a4, a2, 2
-; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
+; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: vle32.v v14, (a4)
-; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmsne.vi v13, v14, 0
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmsltu.vx v14, v8, a3
-; CHECK-NEXT: vmand.mm v13, v13, v14
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: vmsne.vi v0, v10, 0
; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: vmor.mm v12, v12, v13
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: blt a2, a1, .LBB2_1
; CHECK-NEXT: # %bb.2: # %exit
-; CHECK-NEXT: vcpop.m a0, v12
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: vcpop.m a0, v8
; CHECK-NEXT: snez a0, a0
; CHECK-NEXT: ret
entry:
@@ -181,27 +180,26 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
; CHECK-LABEL: widen_anyof_rdx_use_in_loop:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a2, 0
-; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmclr.m v12
-; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: .LBB3_1: # %loop
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub a3, a1, a2
; CHECK-NEXT: slli a4, a2, 2
-; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
+; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma
; CHECK-NEXT: add a4, a0, a4
-; CHECK-NEXT: vle32.v v14, (a4)
-; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmsne.vi v13, v14, 0
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vmsltu.vx v14, v8, a3
-; CHECK-NEXT: vmand.mm v13, v13, v14
-; CHECK-NEXT: vmor.mm v12, v12, v13
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vand.vi v9, v8, 1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
; CHECK-NEXT: add a2, a2, a3
-; CHECK-NEXT: vsm.v v12, (a4)
+; CHECK-NEXT: vsm.v v9, (a4)
; CHECK-NEXT: blt a2, a1, .LBB3_1
; CHECK-NEXT: # %bb.2: # %exit
-; CHECK-NEXT: vcpop.m a0, v12
+; CHECK-NEXT: vcpop.m a0, v9
; CHECK-NEXT: snez a0, a0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
index 3555309695f26..cf5d0f107359a 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
@@ -110,13 +110,14 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
-; CHECK-NEXT: [[TMP4]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
+; CHECK-NEXT: [[TMP1]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i8> splat (i8 1), <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i1>
; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
@@ -154,13 +155,14 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[REC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
-; CHECK-NEXT: [[REC]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
+; CHECK-NEXT: [[TMP1]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> [[CMP]], <vscale x 4 x i8> splat (i8 1), <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
+; CHECK-NEXT: [[REC:%.*]] = trunc <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i1>
; CHECK-NEXT: store <vscale x 4 x i1> [[REC]], ptr [[GEP]], align 1
; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
>From 678851b52c1e830238a5cee3e8bddd89a108faff Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 21 Apr 2025 18:01:47 +0800
Subject: [PATCH 3/3] Reduce num of phi uses needed to 1
Previously, the or we were matching was also counted as a use on the phi, so the check allowed up to two uses.
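A small IR sketch of the use counting (illustrative, mirroring the tests):

  loop:
    %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
    ; the vp.merge is %phi's only user...
    %rec = call <vscale x 4 x i1> @llvm.vp.merge(<vscale x 4 x i1> %cmp, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> %phi, i32 %evl)
  exit:
    ; ...while the reduction uses %rec, not %phi, so hasOneUse() suffices
    %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)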
---
llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index d034d2c7270f8..8bd0d0be88c64 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -146,7 +146,7 @@ bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) {
return false;
auto *Phi = dyn_cast<PHINode>(PhiV);
- if (!Phi || Phi->getNumUses() > 2 || Phi->getNumIncomingValues() != 2 ||
+ if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 ||
!match(Phi->getIncomingValue(0), m_Zero()) ||
Phi->getIncomingValue(1) != &II)
return false;