[llvm] [AArch64] Improve codegen of vectorised early exit loops (PR #119534)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 11 02:06:01 PST 2024
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/119534
Once PR #112138 lands we are able to start vectorising more loops
that have uncountable early exits. The typical loop structure
looks like this:
vector.body:
...
%pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat
...
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body
middle.split:
br i1 %or.reduc, label %found, label %notfound
found:
ret i64 1
notfound:
ret i64 0
The problem with this is that %or.reduc is kept live after the loop,
and since this is a boolean it typically requires making a copy of
the condition code register. For AArch64 this requires an additional
cset instruction, which is quite expensive for a typical find loop
that only contains 6 or 7 instructions.
This patch attempts to improve the codegen by sinking the reduction
out of the loop to the location of it's user. It's a lot cheaper to
keep the predicate alive if the type is legal and has lots of
registers for it. There is a potential downside in that a little
more work is required after the loop, but I believe this is worth
it since we are likely to spend most of our time in the loop.
>From daaf62a4c235184b0f3821ba1ea839092d6f6d89 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 11 Dec 2024 09:57:08 +0000
Subject: [PATCH 1/2] Add tests
---
llvm/test/CodeGen/AArch64/reduce-or-opt.ll | 195 +++++++++++++++++++++
1 file changed, 195 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/reduce-or-opt.ll
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
new file mode 100644
index 00000000000000..cff6cae9b5c32a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define i64 @select_or_reduce_v2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_v2i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tbnz w9, #0, .LBB0_3
+; CHECK-NEXT: // %bb.2: // %vector.body
+; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: cmp x8, #16
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: .LBB0_3: // %middle.split
+; CHECK-NEXT: and x0, x9, #0x1
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+ %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+ %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+ %index.next = add nuw i64 %index, 2
+ %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+ %iv.cmp = icmp eq i64 %index.next, 4
+ %exit.cond = or i1 %or.reduc, %iv.cmp
+ br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+ %sel = select i1 %or.reduc, i64 1, i64 0
+ ret i64 %sel
+}
+
+define i64 @br_or_reduce_v2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_v2i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: .LBB1_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tbnz w9, #0, .LBB1_3
+; CHECK-NEXT: // %bb.2: // %vector.body
+; CHECK-NEXT: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: cmp x8, #16
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: .LBB1_3: // %middle.split
+; CHECK-NEXT: tbz w9, #0, .LBB1_5
+; CHECK-NEXT: // %bb.4: // %found
+; CHECK-NEXT: mov w8, #56 // =0x38
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: ret
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+ %wide.load = load <2 x ptr>, ptr %arrayidx, align 8
+ %cond = icmp eq <2 x ptr> %wide.load, splat(ptr zeroinitializer)
+ %index.next = add nuw i64 %index, 2
+ %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %cond)
+ %iv.cmp = icmp eq i64 %index.next, 4
+ %exit.cond = or i1 %or.reduc, %iv.cmp
+ br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+ br i1 %or.reduc, label %found, label %notfound
+
+found:
+ store i64 56, ptr %p, align 8
+ ret i64 1
+
+notfound:
+ ret i64 0
+}
+
+define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
+; CHECK-LABEL: select_or_reduce_nxv2i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x10, xzr
+; CHECK-NEXT: neg x8, x9
+; CHECK-NEXT: add x11, x8, #4
+; CHECK-NEXT: .LBB2_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: b.ne .LBB2_3
+; CHECK-NEXT: // %bb.2: // %vector.body
+; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT: cmp x11, x10
+; CHECK-NEXT: add x10, x10, x9
+; CHECK-NEXT: b.ne .LBB2_1
+; CHECK-NEXT: .LBB2_3: // %middle.split
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
+entry:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %vf = shl nuw nsw i64 %vscale, 1
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+ %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+ %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+ %index.next = add nuw i64 %index, %vf
+ %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+ %iv.cmp = icmp eq i64 %index.next, 4
+ %exit.cond = or i1 %or.reduc, %iv.cmp
+ br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+ %sel = select i1 %or.reduc, i64 1, i64 0
+ ret i64 %sel
+}
+
+define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef readnone %p) {
+; CHECK-LABEL: br_or_reduce_nxv2i1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: neg x10, x8
+; CHECK-NEXT: add x10, x10, #4
+; CHECK-NEXT: .LBB3_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; CHECK-NEXT: cset w11, ne
+; CHECK-NEXT: b.ne .LBB3_3
+; CHECK-NEXT: // %bb.2: // %vector.body
+; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: add x9, x9, x8
+; CHECK-NEXT: b.ne .LBB3_1
+; CHECK-NEXT: .LBB3_3: // %middle.split
+; CHECK-NEXT: tbz w11, #0, .LBB3_5
+; CHECK-NEXT: // %bb.4: // %found
+; CHECK-NEXT: mov w8, #56 // =0x38
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_5:
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: ret
+entry:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %vf = shl nuw nsw i64 %vscale, 1
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %arrayidx = getelementptr inbounds ptr, ptr %src, i64 %index
+ %wide.load = load <vscale x 2 x ptr>, ptr %arrayidx, align 8
+ %cond = icmp eq <vscale x 2 x ptr> %wide.load, splat(ptr zeroinitializer)
+ %index.next = add nuw i64 %index, %vf
+ %or.reduc = tail call i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1> %cond)
+ %iv.cmp = icmp eq i64 %index.next, 4
+ %exit.cond = or i1 %or.reduc, %iv.cmp
+ br i1 %exit.cond, label %middle.split, label %vector.body
+
+middle.split:
+ br i1 %or.reduc, label %found, label %notfound
+
+found:
+ store i64 56, ptr %p, align 8
+ ret i64 1
+
+notfound:
+ ret i64 0
+}
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv2i1(<vscale x 2 x i1>)
>From c01a65ffb365079658128e1f0b341f5b6665ac86 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 11 Dec 2024 09:58:07 +0000
Subject: [PATCH 2/2] [AArch64] Improve codegen of vectorised early exit loops
Once PR #112138 lands we are able to start vectorising more loops
that have uncountable early exits. The typical loop structure
looks like this:
vector.body:
...
%pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat
...
%or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred)
%iv.cmp = icmp eq i64 %index.next, 4
%exit.cond = or i1 %or.reduc, %iv.cmp
br i1 %exit.cond, label %middle.split, label %vector.body
middle.split:
br i1 %or.reduc, label %found, label %notfound
found:
ret i64 1
notfound:
ret i64 0
The problem with this is that %or.reduc is kept live after the loop,
and since this is a boolean it typically requires making a copy of
the condition code register. For AArch64 this requires an additional
cset instruction, which is quite expensive for a typical find loop
that only contains 6 or 7 instructions.
This patch attempts to improve the codegen by sinking the reduction
out of the loop to the location of it's user. It's a lot cheaper to
keep the predicate alive if the type is legal and has lots of
registers for it. There is a potential downside in that a little
more work is required after the loop, but I believe this is worth
it since we are likely to spend most of our time in the loop.
---
.../AArch64/AArch64TargetTransformInfo.cpp | 25 ++++++++++++++++++-
llvm/test/CodeGen/AArch64/reduce-or-opt.ll | 22 ++++++++--------
2 files changed, 35 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd52..e1c7f5a2c846a0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5092,11 +5092,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
}
}
- // Sink vscales closer to uses for better isel
+ auto ShouldSinkCondition = [] (Value *Cond) -> bool {
+ auto *II = dyn_cast<IntrinsicInst>(Cond);
+ return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+ isa<ScalableVectorType>(II->getOperand(0)->getType());
+ };
+
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
case Instruction::Add:
case Instruction::Sub:
+ // Sink vscales closer to uses for better isel
for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
if (shouldSinkVScale(I->getOperand(Op), Ops)) {
Ops.push_back(&I->getOperandUse(Op));
@@ -5104,6 +5110,23 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
}
}
break;
+ case Instruction::Select: {
+ if (!ShouldSinkCondition(I->getOperand(0)))
+ return false;
+
+ Ops.push_back(&I->getOperandUse(0));
+ return true;
+ }
+ case Instruction::Br: {
+ if (cast<BranchInst>(I)->isUnconditional())
+ return false;
+
+ if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+ return false;
+
+ Ops.push_back(&I->getOperandUse(0));
+ return true;
+ }
default:
break;
}
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
index cff6cae9b5c32a..8c17426a66a083 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -95,24 +95,24 @@ notfound:
define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: select_or_reduce_nxv2i1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cntd x9
+; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x10, xzr
-; CHECK-NEXT: neg x8, x9
-; CHECK-NEXT: add x11, x8, #4
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: neg x10, x8
+; CHECK-NEXT: add x10, x10, #4
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT: cset w8, ne
; CHECK-NEXT: b.ne .LBB2_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT: cmp x11, x10
-; CHECK-NEXT: add x10, x10, x9
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: add x9, x9, x8
; CHECK-NEXT: b.ne .LBB2_1
; CHECK-NEXT: .LBB2_3: // %middle.split
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ptest p0, p1.b
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
entry:
%vscale = tail call i64 @llvm.vscale.i64()
@@ -147,7 +147,6 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT: cset w11, ne
; CHECK-NEXT: b.ne .LBB3_3
; CHECK-NEXT: // %bb.2: // %vector.body
; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
@@ -155,7 +154,8 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
; CHECK-NEXT: add x9, x9, x8
; CHECK-NEXT: b.ne .LBB3_1
; CHECK-NEXT: .LBB3_3: // %middle.split
-; CHECK-NEXT: tbz w11, #0, .LBB3_5
+; CHECK-NEXT: ptest p0, p1.b
+; CHECK-NEXT: b.eq .LBB3_5
; CHECK-NEXT: // %bb.4: // %found
; CHECK-NEXT: mov w8, #56 // =0x38
; CHECK-NEXT: mov w0, #1 // =0x1
More information about the llvm-commits
mailing list