[llvm-branch-commits] [llvm] release/19.x: [LSR] Fix matching vscale immediates (#100080) (PR #100359)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jul 24 05:47:41 PDT 2024
https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/100359
Backport c1b70fa5bfea973d4141e27cf9668e9325609e19 7fad04e94b7b594389111ae7eca0883ef18dc90b
Requested by: @nikic
>From 46b7fc817edd58cff0802c8000712a0b109c0412 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 23 Jul 2024 07:30:02 +0000
Subject: [PATCH 1/2] Precommit vscale-fixups.ll test (NFC)
Precommit test for #100080.
(cherry picked from commit c1b70fa5bfea973d4141e27cf9668e9325609e19)
---
.../AArch64/vscale-fixups.ll | 47 +++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 483955c1c57a0..56b59012eef40 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,4 +384,51 @@ for.exit:
ret void
}
+;; This test demonstrates an incorrect MUL VL address calculation. Here there
+;; are two writes that should be `16 * vscale * vscale` apart, however,
+;; loop-strength-reduce has ignored the second `vscale` and offset the second
+;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+define void @vscale_squared_offset(ptr %alloc) #0 {
+; COMMON-LABEL: vscale_squared_offset:
+; COMMON: // %bb.0: // %entry
+; COMMON-NEXT: fmov z0.s, #4.00000000
+; COMMON-NEXT: mov x8, xzr
+; COMMON-NEXT: cntw x9
+; COMMON-NEXT: fmov z1.s, #8.00000000
+; COMMON-NEXT: ptrue p0.s, vl1
+; COMMON-NEXT: cmp x8, x9
+; COMMON-NEXT: b.ge .LBB6_2
+; COMMON-NEXT: .LBB6_1: // %for.body
+; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT: st1w { z0.s }, p0, [x0]
+; COMMON-NEXT: add x8, x8, #1
+; COMMON-NEXT: st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT: addvl x0, x0, #1
+; COMMON-NEXT: cmp x8, x9
+; COMMON-NEXT: b.lt .LBB6_1
+; COMMON-NEXT: .LBB6_2: // %for.exit
+; COMMON-NEXT: ret
+entry:
+ %vscale = call i64 @llvm.vscale.i64()
+ %c4_vscale = mul i64 %vscale, 4
+ br label %for.check
+for.check:
+ %i = phi i64 [ %next_i, %for.body ], [ 0, %entry ]
+ %is_lt = icmp slt i64 %i, %c4_vscale
+ br i1 %is_lt, label %for.body, label %for.exit
+for.body:
+ %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 0, i64 1)
+ %upper_offset = mul i64 %i, %c4_vscale
+ %upper_ptr = getelementptr float, ptr %alloc, i64 %upper_offset
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %upper_ptr, i32 4, <vscale x 4 x i1> %mask)
+ %lower_i = add i64 %i, %c4_vscale
+ %lower_offset = mul i64 %lower_i, %c4_vscale
+ %lower_ptr = getelementptr float, ptr %alloc, i64 %lower_offset
+ call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 8.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %lower_ptr, i32 4, <vscale x 4 x i1> %mask)
+ %next_i = add i64 %i, 1
+ br label %for.check
+for.exit:
+ ret void
+}
+
attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
>From 187dcdb3472653a36f9207312abc7b9c20e712c4 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 24 Jul 2024 10:06:34 +0100
Subject: [PATCH 2/2] [LSR] Fix matching vscale immediates (#100080)
Somewhat confusingly a `SCEVMulExpr` is a `SCEVNAryExpr`, so can have
> 2 operands. Previously, the vscale immediate matching did not check
the number of operands of the `SCEVMulExpr`, so would ignore any
operands after the first two.
This led to incorrect codegen (and results) for ArmSME in IREE
(https://github.com/iree-org/iree), which sometimes addresses things
that are a `vscale * vscale` multiple away. The test added with this
change shows an example reduced from IREE. The second write should
be offset from the first `16 * vscale * vscale` (* 4 bytes), however,
previously LSR dropped the second vscale and instead offset the write by
`#4, mul vl`, which is an offset of `16 * vscale` (* 4 bytes).
(cherry picked from commit 7fad04e94b7b594389111ae7eca0883ef18dc90b)
---
.../Transforms/Scalar/LoopStrengthReduce.cpp | 6 ++++--
.../AArch64/vscale-fixups.ll | 20 +++++++++++--------
2 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 11f9f7822a15c..91461d1ed2759 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -946,13 +946,15 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
// FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
SCEV::FlagAnyWrap);
return Result;
- } else if (EnableVScaleImmediates)
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+ } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+ if (EnableVScaleImmediates && M->getNumOperands() == 2) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
if (isa<SCEVVScale>(M->getOperand(1))) {
S = SE.getConstant(M->getType(), 0);
return Immediate::getScalable(C->getValue()->getSExtValue());
}
+ }
+ }
return Immediate::getZero();
}
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 56b59012eef40..588696d20227f 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,27 +384,31 @@ for.exit:
ret void
}
-;; This test demonstrates an incorrect MUL VL address calculation. Here there
-;; are two writes that should be `16 * vscale * vscale` apart, however,
-;; loop-strength-reduce has ignored the second `vscale` and offset the second
-;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+;; Here are two writes that should be `16 * vscale * vscale` apart, so MUL VL
+;; addressing cannot be used to offset the second write, as for example,
+;; `#4, mul vl` would only be an offset of `16 * vscale` (dropping a vscale).
define void @vscale_squared_offset(ptr %alloc) #0 {
; COMMON-LABEL: vscale_squared_offset:
; COMMON: // %bb.0: // %entry
+; COMMON-NEXT: rdvl x9, #1
; COMMON-NEXT: fmov z0.s, #4.00000000
; COMMON-NEXT: mov x8, xzr
-; COMMON-NEXT: cntw x9
+; COMMON-NEXT: lsr x9, x9, #4
; COMMON-NEXT: fmov z1.s, #8.00000000
+; COMMON-NEXT: cntw x10
; COMMON-NEXT: ptrue p0.s, vl1
-; COMMON-NEXT: cmp x8, x9
+; COMMON-NEXT: umull x9, w9, w9
+; COMMON-NEXT: lsl x9, x9, #6
+; COMMON-NEXT: cmp x8, x10
; COMMON-NEXT: b.ge .LBB6_2
; COMMON-NEXT: .LBB6_1: // %for.body
; COMMON-NEXT: // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT: add x11, x0, x9
; COMMON-NEXT: st1w { z0.s }, p0, [x0]
; COMMON-NEXT: add x8, x8, #1
-; COMMON-NEXT: st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT: st1w { z1.s }, p0, [x11]
; COMMON-NEXT: addvl x0, x0, #1
-; COMMON-NEXT: cmp x8, x9
+; COMMON-NEXT: cmp x8, x10
; COMMON-NEXT: b.lt .LBB6_1
; COMMON-NEXT: .LBB6_2: // %for.exit
; COMMON-NEXT: ret
More information about the llvm-branch-commits
mailing list