[llvm] [LSR] Recognize vscale-relative immediates (PR #88124)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Wed May 15 06:30:12 PDT 2024
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/88124
From 37f79b4af2ee4c05cd58fa5c337ceeeb16f86851 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 14 May 2024 13:08:52 +0000
Subject: [PATCH 1/3] Test cases for vscale immediates
---
.../AArch64/vscale-fixups.ll | 146 ++++++++++++++++++
1 file changed, 146 insertions(+)
create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
new file mode 100644
index 0000000000000..4652d0d83919c
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+
+;;target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
+; CHECK-LABEL: define void @mulvl123_addressing(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[COUNT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[VSCALE]], 48
+; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[VSCALE]], 80
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[VSCALE]], 5
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[SRC]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 16 x i8>, ptr [[LSR_IV]], align 16
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP3]], align 16
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP2]], align 16
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP1]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]])
+; CHECK-NEXT: [[TMP11:%.*]] = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IDX]]
+; CHECK-NEXT: store <vscale x 16 x i8> [[TMP11]], ptr [[ARRAYIDX4]], align 16
+; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX_NEXT]], [[COUNT]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_EXIT:%.*]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %2 = shl nuw nsw i64 %vscale, 4
+ %mul = shl nuw nsw i64 %vscale, 6
+ br label %for.body
+
+for.body:
+ %src.addr = phi ptr [ %src, %entry ], [ %src.addr.next, %for.body ]
+ %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, ptr %src.addr, i64 %idx
+ %3 = load <vscale x 16 x i8>, ptr %arrayidx
+ %4 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 1
+ %5 = load <vscale x 16 x i8>, ptr %4
+ %6 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 2
+ %7 = load <vscale x 16 x i8>, ptr %6
+ %8 = getelementptr <vscale x 16 x i8>, ptr %arrayidx, i64 3
+ %9 = load <vscale x 16 x i8>, ptr %8
+ %10 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %3, <vscale x 16 x i8> %5)
+ %11 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %7, <vscale x 16 x i8> %9)
+ %12 = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %10, <vscale x 16 x i8> %11)
+ %src.addr.next = getelementptr inbounds i8, ptr %src.addr, i64 %mul
+ %arrayidx4 = getelementptr inbounds i8, ptr %dst, i64 %idx
+ store <vscale x 16 x i8> %12, ptr %arrayidx4
+ %idx.next = add i64 %idx, %2
+ %cmp = icmp ult i64 %idx.next, %count
+ br i1 %cmp, label %for.body, label %for.exit
+
+for.exit:
+ ret void
+}
+
+define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i64 %count) #0 {
+; CHECK-LABEL: define void @many_mulvl1_addressing(
+; CHECK-SAME: ptr [[SRC_ROWS:%.*]], ptr [[DST_ROWS:%.*]], i64 [[STRIDE:%.*]], i64 [[COUNT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[VSCALE]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[STRIDE]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[VSCALE]], 3
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[TMP1]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV10:%.*]] = phi i64 [ [[LSR_IV_NEXT11:%.*]], [[FOR_BODY]] ], [ [[COUNT]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP6]], align 16
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP5]], align 16
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP3]], align 16
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP2]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 16 x i8> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 16 x i8> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <vscale x 16 x i8> [[TMP6]] to <vscale x 8 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <vscale x 8 x i16> [[TMP8]] to <vscale x 8 x i8>
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP9]], ptr [[SCEVGEP9]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 16 x i8> [[TMP7]] to <vscale x 8 x i16>
+; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x i8>
+; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[SCEVGEP8]], align 8
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], [[MUL]]
+; CHECK-NEXT: [[LSR_IV_NEXT11]] = add i64 [[LSR_IV10]], -1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT11]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %mul = shl nuw nsw i64 %vscale, 5
+ br label %for.body
+
+for.body:
+ %src_row_addr = phi ptr [ %src_rows, %entry ], [ %add_ptr_src, %for.body ]
+ %dst_row_addr = phi ptr [ %dst_rows, %entry ], [ %add_ptr_dst, %for.body ]
+ %idx = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %2 = load <vscale x 16 x i8>, ptr %src_row_addr
+ %3 = getelementptr <vscale x 16 x i8>, ptr %src_row_addr, i64 1
+ %4 = load <vscale x 16 x i8>, ptr %3
+ %arrayidx2 = getelementptr inbounds i8, ptr %src_row_addr, i64 %stride
+ %5 = load <vscale x 16 x i8>, ptr %arrayidx2
+ %6 = getelementptr <vscale x 16 x i8>, ptr %arrayidx2, i64 1
+ %7 = load <vscale x 16 x i8>, ptr %6
+ %8 = add <vscale x 16 x i8> %2, %5
+ %9 = add <vscale x 16 x i8> %4, %7
+ %10 = bitcast <vscale x 16 x i8> %8 to <vscale x 8 x i16>
+ %11 = trunc <vscale x 8 x i16> %10 to <vscale x 8 x i8>
+ store <vscale x 8 x i8> %11, ptr %dst_row_addr
+ %12 = bitcast <vscale x 16 x i8> %9 to <vscale x 8 x i16>
+ %13 = getelementptr <vscale x 8 x i8>, ptr %dst_row_addr, i64 1
+ %14 = trunc <vscale x 8 x i16> %12 to <vscale x 8 x i8>
+ store <vscale x 8 x i8> %14, ptr %13
+ %add_ptr_src = getelementptr inbounds i8, ptr %src_row_addr, i64 %mul
+ %add_ptr_dst = getelementptr inbounds i8, ptr %dst_row_addr, i64 %mul
+ %inc = add nuw i64 %idx, 1
+ %exitcond = icmp eq i64 %inc, %count
+ br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
From a4ca046380d748dafc3d7e8a0480eff30727f308 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 15 May 2024 11:06:24 +0000
Subject: [PATCH 2/3] Convert LSR to use possibly-scalable Immediate type
---
.../Transforms/Scalar/LoopStrengthReduce.cpp | 411 +++++++++++-------
1 file changed, 256 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 35a17d6060c94..00beec1a0637f 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -247,6 +247,68 @@ class RegSortData {
void dump() const;
};
+// An offset from an address that is either scalable or fixed. Used for
+// per-target optimizations of addressing modes.
+class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
+ constexpr Immediate(ScalarTy MinVal, bool Scalable)
+ : FixedOrScalableQuantity(MinVal, Scalable) {}
+
+ constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
+ : FixedOrScalableQuantity(V) {}
+
+public:
+ constexpr Immediate() : FixedOrScalableQuantity() {}
+
+ static constexpr Immediate getFixed(ScalarTy MinVal) {
+ return Immediate(MinVal, false);
+ }
+ static constexpr Immediate getScalable(ScalarTy MinVal) {
+ return Immediate(MinVal, true);
+ }
+ static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
+ return Immediate(MinVal, Scalable);
+ }
+
+ constexpr bool isLessThanZero() const { return Quantity < 0; }
+
+ constexpr bool isGreaterThanZero() const { return Quantity > 0; }
+
+ constexpr bool isMin() const {
+ return Quantity == std::numeric_limits<ScalarTy>::min();
+ }
+
+ constexpr bool isMax() const {
+ return Quantity == std::numeric_limits<ScalarTy>::max();
+ }
+};
+
+// This is needed for the Compare type of std::map when Immediate is used
+// as a key. We don't need it to be fully correct against any value of vscale,
+// just to make sure that vscale-related terms in the map are considered against
+// each other rather than being mixed up and potentially missing opportunities.
+struct KeyOrderTargetImmediate {
+ bool operator()(const Immediate &LHS, const Immediate &RHS) const {
+ if (LHS.isScalable() && !RHS.isScalable())
+ return false;
+ if (!LHS.isScalable() && RHS.isScalable())
+ return true;
+ return LHS.getKnownMinValue() < RHS.getKnownMinValue();
+ }
+};
+
+// This would be nicer if we could be generic instead of directly using size_t,
+// but there doesn't seem to be a type trait for is_orderable or
+// is_lessthan_comparable or similar.
+struct KeyOrderSizeTAndImmediate {
+ bool operator()(const std::pair<size_t, Immediate> &LHS,
+ const std::pair<size_t, Immediate> &RHS) const {
+ size_t LSize = LHS.first;
+ size_t RSize = RHS.first;
+ if (LSize != RSize)
+ return LSize < RSize;
+ return KeyOrderTargetImmediate()(LHS.second, RHS.second);
+ }
+};
} // end anonymous namespace
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
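
For readers skimming the diff, here is a minimal standalone sketch of the ordering KeyOrderTargetImmediate establishes: all fixed offsets sort before all scalable ones, and the known minimum value decides within each group. The Imm and KeyOrder types below are illustrative stand-ins rather than the classes from the patch.

  // Illustration only: stand-in types mirroring the comparator above.
  #include <cassert>
  #include <cstdint>
  #include <map>

  struct Imm {
    int64_t MinVal; // Known minimum value of the offset.
    bool Scalable;  // True if the offset is really MinVal * vscale.
  };

  struct KeyOrder {
    bool operator()(const Imm &L, const Imm &R) const {
      // Fixed keys compare less than scalable keys, so the two groups never
      // interleave; within a group, order by the known minimum value.
      if (L.Scalable != R.Scalable)
        return !L.Scalable;
      return L.MinVal < R.MinVal;
    }
  };

  int main() {
    std::map<Imm, const char *, KeyOrder> M;
    M[{32, false}] = "32";
    M[{16, true}] = "16 * vscale";
    M[{16, false}] = "16";
    // Iteration visits "16", "32", then "16 * vscale": vscale-relative keys
    // stay grouped together rather than being interleaved with fixed offsets.
    assert(!M.begin()->first.Scalable && M.begin()->first.MinVal == 16);
    return 0;
  }
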
@@ -357,7 +419,7 @@ struct Formula {
GlobalValue *BaseGV = nullptr;
/// Base offset for complex addressing.
- int64_t BaseOffset = 0;
+ Immediate BaseOffset;
/// Whether any complex addressing has a base register.
bool HasBaseReg = false;
@@ -388,7 +450,7 @@ struct Formula {
/// An additional constant offset which added near the use. This requires a
/// temporary register, but the offset itself can live in an add immediate
/// field rather than a register.
- int64_t UnfoldedOffset = 0;
+ Immediate UnfoldedOffset;
Formula() = default;
@@ -628,7 +690,7 @@ void Formula::print(raw_ostream &OS) const {
if (!First) OS << " + "; else First = false;
BaseGV->printAsOperand(OS, /*PrintType=*/false);
}
- if (BaseOffset != 0) {
+ if (BaseOffset.isNonZero()) {
if (!First) OS << " + "; else First = false;
OS << BaseOffset;
}
@@ -652,7 +714,7 @@ void Formula::print(raw_ostream &OS) const {
OS << "<unknown>";
OS << ')';
}
- if (UnfoldedOffset != 0) {
+ if (UnfoldedOffset.isNonZero()) {
if (!First) OS << " + ";
OS << "imm(" << UnfoldedOffset << ')';
}
@@ -798,28 +860,28 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
/// If S involves the addition of a constant integer value, return that integer
/// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
if (C->getAPInt().getSignificantBits() <= 64) {
S = SE.getConstant(C->getType(), 0);
- return C->getValue()->getSExtValue();
+ return Immediate::getFixed(C->getValue()->getSExtValue());
}
} else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(Add->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
+ Immediate Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result.isNonZero())
S = SE.getAddExpr(NewOps);
return Result;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(AR->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
+ Immediate Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result.isNonZero())
S = SE.getAddRecExpr(NewOps, AR->getLoop(),
// FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
SCEV::FlagAnyWrap);
return Result;
}
- return 0;
+ return Immediate();
}
/// If S involves the addition of a GlobalValue address, return that symbol, and
@@ -1134,7 +1196,7 @@ struct LSRFixup {
/// A constant offset to be added to the LSRUse expression. This allows
/// multiple fixups to share the same LSRUse with different offsets, for
/// example in an unrolled loop.
- int64_t Offset = 0;
+ Immediate Offset;
LSRFixup() = default;
@@ -1197,8 +1259,10 @@ class LSRUse {
SmallVector<LSRFixup, 8> Fixups;
/// Keep track of the min and max offsets of the fixups.
- int64_t MinOffset = std::numeric_limits<int64_t>::max();
- int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+ Immediate MinOffset =
+ Immediate::getFixed(std::numeric_limits<int64_t>::max());
+ Immediate MaxOffset =
+ Immediate::getFixed(std::numeric_limits<int64_t>::min());
/// This records whether all of the fixups using this LSRUse are outside of
/// the loop, in which case some special-case heuristics may be used.
@@ -1234,9 +1298,9 @@ class LSRUse {
void pushFixup(LSRFixup &f) {
Fixups.push_back(f);
- if (f.Offset > MaxOffset)
+ if (Immediate::isKnownGT(f.Offset, MaxOffset))
MaxOffset = f.Offset;
- if (f.Offset < MinOffset)
+ if (Immediate::isKnownLT(f.Offset, MinOffset))
MinOffset = f.Offset;
}
@@ -1254,7 +1318,7 @@ class LSRUse {
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
+ GlobalValue *BaseGV, Immediate BaseOffset,
bool HasBaseReg, int64_t Scale,
Instruction *Fixup = nullptr);
@@ -1310,7 +1374,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
// addressing.
if (AMK == TTI::AMK_PreIndexed) {
if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
- if (Step->getAPInt() == F.BaseOffset)
+ if (Step->getAPInt() == F.BaseOffset.getFixedValue())
LoopCost = 0;
} else if (AMK == TTI::AMK_PostIndexed) {
const SCEV *LoopStep = AR->getStepRecurrence(*SE);
@@ -1401,24 +1465,25 @@ void Cost::RateFormula(const Formula &F,
// allows to fold 2 registers.
C.NumBaseAdds +=
NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
- C.NumBaseAdds += (F.UnfoldedOffset != 0);
+ C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
// Accumulate non-free scaling amounts.
C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
- int64_t O = Fixup.Offset;
- int64_t Offset = (uint64_t)O + F.BaseOffset;
+ Immediate O = Fixup.Offset;
+ Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() +
+ F.BaseOffset.getFixedValue());
if (F.BaseGV)
C.ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
- else if (Offset != 0)
- C.ImmCost += APInt(64, Offset, true).getSignificantBits();
+ else if (Offset.isNonZero())
+ C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits();
// Check with target if this offset with this instruction is
// specifically not supported.
- if (LU.Kind == LSRUse::Address && Offset != 0 &&
+ if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
!isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
C.NumBaseAdds++;
@@ -1546,7 +1611,7 @@ void LSRFixup::print(raw_ostream &OS) const {
PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
}
- if (Offset != 0)
+ if (Offset.isNonZero())
OS << ", Offset=" << Offset;
}
@@ -1673,13 +1738,14 @@ LLVM_DUMP_METHOD void LSRUse::dump() const {
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
+ GlobalValue *BaseGV, Immediate BaseOffset,
bool HasBaseReg, int64_t Scale,
- Instruction *Fixup/*= nullptr*/) {
+ Instruction *Fixup /*= nullptr*/) {
switch (Kind) {
case LSRUse::Address:
- return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
- HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV,
+ BaseOffset.getFixedValue(), HasBaseReg,
+ Scale, AccessTy.AddrSpace, Fixup);
case LSRUse::ICmpZero:
// There's not even a target hook for querying whether it would be legal to
@@ -1688,7 +1754,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
return false;
// ICmp only has two operands; don't allow more than two non-trivial parts.
- if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+ if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
return false;
// ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
@@ -1698,7 +1764,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// If we have low-level target information, ask the target if it can fold an
// integer immediate on an icmp.
- if (BaseOffset != 0) {
+ if (BaseOffset.isNonZero()) {
// We have one of:
// ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
// ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
@@ -1706,8 +1772,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
if (Scale == 0)
// The cast does the right thing with
// std::numeric_limits<int64_t>::min().
- BaseOffset = -(uint64_t)BaseOffset;
- return TTI.isLegalICmpImmediate(BaseOffset);
+ BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
+ return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
}
// ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
@@ -1715,30 +1781,34 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
case LSRUse::Basic:
// Only handle single-register values.
- return !BaseGV && Scale == 0 && BaseOffset == 0;
+ return !BaseGV && Scale == 0 && BaseOffset.isZero();
case LSRUse::Special:
// Special case Basic to handle -1 scales.
- return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
}
llvm_unreachable("Invalid LSRUse Kind!");
}
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
+ Immediate MinOffset, Immediate MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
+ GlobalValue *BaseGV, Immediate BaseOffset,
bool HasBaseReg, int64_t Scale) {
+ int64_t Base = BaseOffset.getFixedValue();
+ int64_t Min = MinOffset.getFixedValue();
+ int64_t Max = MaxOffset.getFixedValue();
// Check for overflow.
- if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
- (MinOffset > 0))
+ if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
return false;
- MinOffset = (uint64_t)BaseOffset + MinOffset;
- if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
- (MaxOffset > 0))
+ Min = (uint64_t)Base + Min;
+ if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
return false;
- MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+ Max = (uint64_t)Base + Max;
+
+ MinOffset = Immediate::getFixed(Min);
+ MaxOffset = Immediate::getFixed(Max);
return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
HasBaseReg, Scale) &&
@@ -1747,7 +1817,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
+ Immediate MinOffset, Immediate MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
@@ -1763,10 +1833,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
/// Test whether we know how to expand the current formula.
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+ Immediate MaxOffset, LSRUse::KindType Kind,
MemAccessTy AccessTy, GlobalValue *BaseGV,
- int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+ Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
// We know how to expand completely foldable formulae.
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
BaseOffset, HasBaseReg, Scale) ||
@@ -1777,8 +1847,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
BaseGV, BaseOffset, true, 0));
}
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+ Immediate MaxOffset, LSRUse::KindType Kind,
MemAccessTy AccessTy, const Formula &F) {
return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
F.BaseOffset, F.HasBaseReg, F.Scale);
@@ -1815,15 +1885,15 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
switch (LU.Kind) {
case LSRUse::Address: {
+ int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
+ int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
// Check the scaling factor cost with both the min and max offsets.
InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV,
- StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
+ LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin),
+ F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV,
- StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
+ LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax),
+ F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
"Legal addressing mode has an illegal cost!");
@@ -1842,10 +1912,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
+ GlobalValue *BaseGV, Immediate BaseOffset,
bool HasBaseReg) {
// Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
+ if (BaseOffset.isZero() && !BaseGV)
+ return true;
// Conservatively, create an address with an immediate and a
// base and a scale.
@@ -1863,8 +1934,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
}
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
- ScalarEvolution &SE, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
+ ScalarEvolution &SE, Immediate MinOffset,
+ Immediate MaxOffset, LSRUse::KindType Kind,
MemAccessTy AccessTy, const SCEV *S,
bool HasBaseReg) {
// Fast-path: zero is always foldable.
@@ -1872,14 +1943,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
// Conservatively, create an address with an immediate and a
// base and a scale.
- int64_t BaseOffset = ExtractImmediate(S, SE);
+ Immediate BaseOffset = ExtractImmediate(S, SE);
GlobalValue *BaseGV = ExtractSymbol(S, SE);
// If there's anything else involved, it's not foldable.
if (!S->isZero()) return false;
// Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
+ if (BaseOffset.isZero() && !BaseGV)
+ return true;
// Conservatively, create an address with an immediate and a
// base and a scale.
@@ -2028,11 +2100,11 @@ class LSRInstance {
using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
UseMapTy UseMap;
- bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
LSRUse::KindType Kind, MemAccessTy AccessTy);
- std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
- MemAccessTy AccessTy);
+ std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
void DeleteUse(LSRUse &LU, size_t LUIdx);
@@ -2058,7 +2130,7 @@ class LSRInstance {
void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist,
+ const SmallVectorImpl<Immediate> &Worklist,
size_t Idx, bool IsScaledReg = false);
void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
@@ -2566,11 +2638,11 @@ LSRInstance::OptimizeLoopTermCond() {
/// Determine if the given use can accommodate a fixup at the given offset and
/// other details. If so, update the use and return true.
-bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
bool HasBaseReg, LSRUse::KindType Kind,
MemAccessTy AccessTy) {
- int64_t NewMinOffset = LU.MinOffset;
- int64_t NewMaxOffset = LU.MaxOffset;
+ Immediate NewMinOffset = LU.MinOffset;
+ Immediate NewMaxOffset = LU.MaxOffset;
MemAccessTy NewAccessTy = AccessTy;
// Check for a mismatched kind. It's tempting to collapse mismatched kinds to
@@ -2590,12 +2662,12 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
}
// Conservatively assume HasBaseReg is true for now.
- if (NewOffset < LU.MinOffset) {
+ if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
LU.MaxOffset - NewOffset, HasBaseReg))
return false;
NewMinOffset = NewOffset;
- } else if (NewOffset > LU.MaxOffset) {
+ } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
NewOffset - LU.MinOffset, HasBaseReg))
return false;
@@ -2612,17 +2684,17 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
/// Return an LSRUse index and an offset value for a fixup which needs the given
/// expression, with the given kind and optional access type. Either reuse an
/// existing use or create a new one, as needed.
-std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
- LSRUse::KindType Kind,
- MemAccessTy AccessTy) {
+std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
+ LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
const SCEV *Copy = Expr;
- int64_t Offset = ExtractImmediate(Expr, SE);
+ Immediate Offset = ExtractImmediate(Expr, SE);
// Basic uses can't accept any offset, for example.
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
Offset, /*HasBaseReg=*/ true)) {
Expr = Copy;
- Offset = 0;
+ Offset = Immediate::getFixed(0);
}
std::pair<UseMapTy::iterator, bool> P =
@@ -2683,7 +2755,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
F.BaseGV == OrigF.BaseGV &&
F.Scale == OrigF.Scale &&
F.UnfoldedOffset == OrigF.UnfoldedOffset) {
- if (F.BaseOffset == 0)
+ if (F.BaseOffset.isZero())
return &LU;
// This is the formula where all the registers and symbols matched;
// there aren't going to be any others. Since we declined it, we
@@ -3174,7 +3246,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
int64_t IncOffset = IncConst->getValue()->getSExtValue();
if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
- IncOffset, /*HasBaseReg=*/false))
+ Immediate::getFixed(IncOffset), /*HasBaseReg=*/false))
return false;
return true;
@@ -3380,9 +3452,9 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
}
// Get or create an LSRUse.
- std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
size_t LUIdx = P.first;
- int64_t Offset = P.second;
+ Immediate Offset = P.second;
LSRUse &LU = Uses[LUIdx];
// Record the fixup.
@@ -3572,10 +3644,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
continue;
}
- std::pair<size_t, int64_t> P = getUse(
- S, LSRUse::Basic, MemAccessTy());
+ std::pair<size_t, Immediate> P =
+ getUse(S, LSRUse::Basic, MemAccessTy());
size_t LUIdx = P.first;
- int64_t Offset = P.second;
+ Immediate Offset = P.second;
LSRUse &LU = Uses[LUIdx];
LSRFixup &LF = LU.getNewFixup();
LF.UserInst = const_cast<Instruction *>(UserInst);
@@ -3734,10 +3806,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
InnerSumSC->getValue()->getZExtValue())) {
F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
+ InnerSumSC->getValue()->getZExtValue());
if (IsScaledReg)
F.ScaledReg = nullptr;
else
@@ -3750,10 +3823,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
// Add J as its own register, or an unfolded immediate.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
SC->getValue()->getZExtValue()))
F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
+ SC->getValue()->getZExtValue());
else
F.BaseRegs.push_back(*J);
// We may have changed the number of register in base regs, adjust the
@@ -3794,7 +3868,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// This method is only interesting on a plurality of registers.
if (Base.BaseRegs.size() + (Base.Scale == 1) +
- (Base.UnfoldedOffset != 0) <= 1)
+ (Base.UnfoldedOffset.isNonZero()) <=
+ 1)
return;
// Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
@@ -3845,9 +3920,9 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// registers collected.
if (NewBase.UnfoldedOffset) {
assert(CombinedIntegerType && "Missing a type for the unfolded offset");
- Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
- true));
- NewBase.UnfoldedOffset = 0;
+ Ops.push_back(SE.getConstant(CombinedIntegerType,
+ NewBase.UnfoldedOffset.getFixedValue(), true));
+ NewBase.UnfoldedOffset = Immediate::getFixed(0);
GenerateFormula(SE.getAddExpr(Ops));
}
}
@@ -3887,15 +3962,17 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
LSRUse &LU, unsigned LUIdx, const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+ const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
- auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
+ auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
Formula F = Base;
- F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+ F.BaseOffset = Immediate::getFixed(
+ (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue());
if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
// Add the offset to the base register.
- const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ const SCEV *NewG = SE.getAddExpr(
+ SE.getConstant(G->getType(), Offset.getFixedValue()), G);
// If it cancelled out, drop the base register, otherwise update it.
if (NewG->isZero()) {
if (IsScaledReg) {
@@ -3931,21 +4008,22 @@ void LSRInstance::GenerateConstantOffsetsImpl(
int64_t Step = StepInt.isNegative() ?
StepInt.getSExtValue() : StepInt.getZExtValue();
- for (int64_t Offset : Worklist) {
- Offset -= Step;
+ for (Immediate Offset : Worklist) {
+ Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
GenerateOffset(G, Offset);
}
}
}
}
- for (int64_t Offset : Worklist)
+ for (Immediate Offset : Worklist)
GenerateOffset(G, Offset);
- int64_t Imm = ExtractImmediate(G, SE);
- if (G->isZero() || Imm == 0)
+ Immediate Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm.isZero())
return;
Formula F = Base;
- F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+ Imm.getFixedValue());
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
return;
if (IsScaledReg) {
@@ -3964,7 +4042,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// TODO: For now, just add the min and max offset, because it usually isn't
// worthwhile looking at everything inbetween.
- SmallVector<int64_t, 2> Worklist;
+ SmallVector<Immediate, 2> Worklist;
Worklist.push_back(LU.MinOffset);
if (LU.MaxOffset != LU.MinOffset)
Worklist.push_back(LU.MaxOffset);
@@ -4004,27 +4082,29 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
if (!ConstantInt::isValueValidForType(IntTy, Factor))
continue;
// Check that the multiplication doesn't overflow.
- if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ if (Base.BaseOffset.isMin() && Factor == -1)
continue;
- int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ Immediate NewBaseOffset =
+ Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor);
assert(Factor != 0 && "Zero factor not expected!");
- if (NewBaseOffset / Factor != Base.BaseOffset)
+ if (NewBaseOffset.getFixedValue() / Factor !=
+ Base.BaseOffset.getFixedValue())
continue;
// If the offset will be truncated at this use, check that it is in bounds.
if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
continue;
// Check that multiplying with the use offset doesn't overflow.
- int64_t Offset = LU.MinOffset;
- if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ Immediate Offset = LU.MinOffset;
+ if (Offset.isMin() && Factor == -1)
continue;
- Offset = (uint64_t)Offset * Factor;
- if (Offset / Factor != LU.MinOffset)
+ Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor);
+ if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
continue;
// If the offset will be truncated at this use, check that it is in bounds.
if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, Offset))
+ !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
continue;
Formula F = Base;
@@ -4035,7 +4115,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
continue;
// Compensate for the use having MinOffset built into it.
- F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+ F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+ Offset.getFixedValue() -
+ LU.MinOffset.getFixedValue());
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
@@ -4054,16 +4136,17 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
}
// Check that multiplying with the unfolded offset doesn't overflow.
- if (F.UnfoldedOffset != 0) {
- if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
- Factor == -1)
+ if (F.UnfoldedOffset.isNonZero()) {
+ if (F.UnfoldedOffset.isMin() && Factor == -1)
continue;
- F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
- if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ F.UnfoldedOffset = Immediate::getFixed(
+ (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor);
+ if (F.UnfoldedOffset.getFixedValue() / Factor !=
+ Base.UnfoldedOffset.getFixedValue())
continue;
// If the offset will be truncated, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
+ IntTy, F.UnfoldedOffset.getFixedValue()))
continue;
}
@@ -4106,8 +4189,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
}
// For an ICmpZero, negating a solitary base register won't lead to
// new solutions.
- if (LU.Kind == LSRUse::ICmpZero &&
- !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
+ Base.BaseOffset.isZero() && !Base.BaseGV)
continue;
// For each addrec base reg, if its loop is current loop, apply the scale.
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
@@ -4233,10 +4316,10 @@ namespace {
/// structures moving underneath it.
struct WorkItem {
size_t LUIdx;
- int64_t Imm;
+ Immediate Imm;
const SCEV *OrigReg;
- WorkItem(size_t LI, int64_t I, const SCEV *R)
+ WorkItem(size_t LI, Immediate I, const SCEV *R)
: LUIdx(LI), Imm(I), OrigReg(R) {}
void print(raw_ostream &OS) const;
@@ -4260,14 +4343,14 @@ LLVM_DUMP_METHOD void WorkItem::dump() const {
/// opportunities between them.
void LSRInstance::GenerateCrossUseConstantOffsets() {
// Group the registers by their value without any added constant offset.
- using ImmMapTy = std::map<int64_t, const SCEV *>;
+ using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
DenseMap<const SCEV *, ImmMapTy> Map;
DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
SmallVector<const SCEV *, 8> Sequence;
for (const SCEV *Use : RegUses) {
const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
- int64_t Imm = ExtractImmediate(Reg, SE);
+ Immediate Imm = ExtractImmediate(Reg, SE);
auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
if (Pair.second)
Sequence.push_back(Reg);
@@ -4279,7 +4362,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// a list of work to do and do the work in a separate step so that we're
// not adding formulae and register counts while we're searching.
SmallVector<WorkItem, 32> WorkItems;
- SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+ SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
+ UniqueItems;
for (const SCEV *Reg : Sequence) {
const ImmMapTy &Imms = Map.find(Reg)->second;
@@ -4298,7 +4382,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
J != JE; ++J) {
const SCEV *OrigReg = J->second;
- int64_t JImm = J->first;
+ Immediate JImm = J->first;
const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
if (!isa<SCEVConstant>(OrigReg) &&
@@ -4310,8 +4394,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Conservatively examine offsets between this orig reg a few selected
// other orig regs.
- int64_t First = Imms.begin()->first;
- int64_t Last = std::prev(Imms.end())->first;
+ int64_t First = Imms.begin()->first.getFixedValue();
+ int64_t Last = std::prev(Imms.end())->first.getFixedValue();
// Compute (First + Last) / 2 without overflow using the fact that
// First + Last = 2 * (First & Last) + (First ^ Last).
int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
@@ -4320,12 +4404,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
ImmMapTy::const_iterator OtherImms[] = {
Imms.begin(), std::prev(Imms.end()),
- Imms.lower_bound(Avg)};
+ Imms.lower_bound(Immediate::getFixed(Avg))};
for (const auto &M : OtherImms) {
if (M == J || M == JE) continue;
// Compute the difference between the two.
- int64_t Imm = (uint64_t)JImm - M->first;
+ Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() -
+ M->first.getFixedValue());
for (unsigned LUIdx : UsedByIndices.set_bits())
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
@@ -4343,11 +4428,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
for (const WorkItem &WI : WorkItems) {
size_t LUIdx = WI.LUIdx;
LSRUse &LU = Uses[LUIdx];
- int64_t Imm = WI.Imm;
+ Immediate Imm = WI.Imm;
const SCEV *OrigReg = WI.OrigReg;
Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
- const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+ const SCEV *NegImmS =
+ SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue()));
unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
// TODO: Use a more targeted data structure.
@@ -4360,10 +4446,12 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
F.unscale();
// Use the immediate in the scaled register.
if (F.ScaledReg == OrigReg) {
- int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+ Immediate Offset =
+ Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
+ Imm.getFixedValue() * (uint64_t)F.Scale);
// Don't create 50 + reg(-50).
if (F.referencesReg(SE.getSCEV(
- ConstantInt::get(IntTy, -(uint64_t)Offset))))
+ ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()))))
continue;
Formula NewF = F;
NewF.BaseOffset = Offset;
@@ -4376,9 +4464,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// value to the immediate would produce a value closer to zero than the
// immediate itself, then the formula isn't worthwhile.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
- if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+ if (C->getValue()->isNegative() !=
+ (NewF.BaseOffset.isLessThanZero()) &&
(C->getAPInt().abs() * APInt(BitWidth, F.Scale))
- .ule(std::abs(NewF.BaseOffset)))
+ .ule(std::abs(NewF.BaseOffset.getFixedValue())))
continue;
// OK, looks good.
@@ -4391,16 +4480,21 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (BaseReg != OrigReg)
continue;
Formula NewF = F;
- NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ NewF.BaseOffset = Immediate::getFixed(
+ (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue());
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, NewF)) {
if (AMK == TTI::AMK_PostIndexed &&
mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
continue;
- if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ if (!TTI.isLegalAddImmediate(
+ (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
+ Imm.getFixedValue()))
continue;
NewF = F;
- NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ NewF.UnfoldedOffset = Immediate::getFixed(
+ (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
+ Imm.getFixedValue());
}
NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
@@ -4409,11 +4503,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// zero than the immediate itself, then the formula isn't worthwhile.
for (const SCEV *NewReg : NewF.BaseRegs)
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
- if ((C->getAPInt() + NewF.BaseOffset)
+ if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
.abs()
- .slt(std::abs(NewF.BaseOffset)) &&
- (C->getAPInt() + NewF.BaseOffset).countr_zero() >=
- (unsigned)llvm::countr_zero<uint64_t>(NewF.BaseOffset))
+ .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
+ (C->getAPInt() + NewF.BaseOffset.getFixedValue())
+ .countr_zero() >=
+ (unsigned)llvm::countr_zero<uint64_t>(
+ NewF.BaseOffset.getFixedValue()))
goto skip_formula;
// Ok, looks good.
@@ -4607,7 +4703,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
Formula NewF = F;
//FIXME: Formulas should store bitwidth to do wrapping properly.
// See PR41034.
- NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
+ NewF.BaseOffset =
+ Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
+ (uint64_t)C->getValue()->getSExtValue());
NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
(I - F.BaseRegs.begin()));
if (LU.HasFormulaWithSameRegs(NewF)) {
@@ -4663,7 +4761,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
for (const Formula &F : LU.Formulae) {
- if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+ if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
continue;
LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
@@ -5487,30 +5585,33 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
}
// Expand the immediate portion.
- int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
- if (Offset != 0) {
+ Immediate Offset = Immediate::getFixed(
+ (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue());
+ if (Offset.isNonZero()) {
if (LU.Kind == LSRUse::ICmpZero) {
// The other interesting way of "folding" with an ICmpZero is to use a
// negated immediate.
if (!ICmpScaledV)
- ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ ICmpScaledV =
+ ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
else {
Ops.push_back(SE.getUnknown(ICmpScaledV));
- ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
}
} else {
// Just add the immediate values. These again are expected to be matched
// as part of the address.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ Ops.push_back(
+ SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue())));
}
}
// Expand the unfolded offset portion.
- int64_t UnfoldedOffset = F.UnfoldedOffset;
- if (UnfoldedOffset != 0) {
+ Immediate UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset.isNonZero()) {
// Just add the immediate values.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
- UnfoldedOffset)));
+ Ops.push_back(SE.getUnknown(
+ ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue())));
}
// Emit instructions summing all the operands.
@@ -5546,7 +5647,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
"ICmp does not support folding a global value and "
"a scale at the same time!");
Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
- -(uint64_t)Offset);
+ -(uint64_t)Offset.getFixedValue());
if (C->getType() != OpTy) {
C = ConstantFoldCastOperand(
CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
From 37ae026c57cef66a816c0a6a760aba7fcc0b9fc8 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 14 May 2024 16:04:13 +0000
Subject: [PATCH 3/3] Scalable work
---
.../Transforms/Scalar/LoopStrengthReduce.cpp | 226 +++++++++++++-----
.../AArch64/vscale-fixups.ll | 23 +-
2 files changed, 182 insertions(+), 67 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 00beec1a0637f..a0dff775ddec4 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -197,6 +197,14 @@ static cl::opt<bool> AllowDropSolutionIfLessProfitable(
"lsr-drop-solution", cl::Hidden, cl::init(false),
cl::desc("Attempt to drop solution if it is less profitable"));
+static cl::opt<bool> EnableVScaleImmediates(
+ "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
+ cl::desc("Enable analysis of vscale-relative immediates in LSR"));
+
+static cl::opt<bool> DropScaledForVScale(
+ "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
+ cl::desc("Avoid using scaled registers with vscale-relative addressing"));
+
STATISTIC(NumTermFold,
"Number of terminating condition fold recognized and performed");
@@ -880,7 +888,13 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
// FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
SCEV::FlagAnyWrap);
return Result;
- }
+ } else if (EnableVScaleImmediates)
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+ if (isa<SCEVVScale>(M->getOperand(1))) {
+ S = SE.getConstant(M->getType(), 0);
+ return Immediate::getScalable(C->getValue()->getSExtValue());
+ }
return Immediate();
}
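
The new case mirrors the existing SCEVConstant handling: a multiply of a constant by vscale is peeled off the expression and reported as a scalable Immediate. A compile-only sketch of just that matching step, assuming the usual LLVM headers (the helper name is illustrative, not from the patch):

  // Returns the scalable immediate C when S has the shape (C * vscale),
  // mirroring the check added to ExtractImmediate above.
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include <optional>

  static std::optional<int64_t> matchScalableImmediate(const llvm::SCEV *S) {
    using namespace llvm;
    if (const auto *M = dyn_cast<SCEVMulExpr>(S))
      if (const auto *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
        if (isa<SCEVVScale>(M->getOperand(1)))
          return C->getValue()->getSExtValue();
    return std::nullopt;
  }

This is what allows an offset such as (16 * vscale) from the tests in the first patch to be treated as an addressing-mode immediate (the mul vl forms on SVE) rather than occupying a separate register.
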
@@ -1472,14 +1486,18 @@ void Cost::RateFormula(const Formula &F,
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
- Immediate O = Fixup.Offset;
- Immediate Offset = Immediate::getFixed((uint64_t)O.getFixedValue() +
- F.BaseOffset.getFixedValue());
+ // FIXME: We probably want to noticeably increase the cost if the
+ // two offsets differ in scalability?
+ bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable();
+ int64_t O = Fixup.Offset.getKnownMinValue();
+ Immediate Offset = Immediate::get(
+ (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable);
if (F.BaseGV)
C.ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
else if (Offset.isNonZero())
- C.ImmCost += APInt(64, Offset.getFixedValue(), true).getSignificantBits();
+ C.ImmCost +=
+ APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
// Check with target if this offset with this instruction is
// specifically not supported.
@@ -1742,11 +1760,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
bool HasBaseReg, int64_t Scale,
Instruction *Fixup /*= nullptr*/) {
switch (Kind) {
- case LSRUse::Address:
- return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV,
- BaseOffset.getFixedValue(), HasBaseReg,
- Scale, AccessTy.AddrSpace, Fixup);
-
+ case LSRUse::Address: {
+ int64_t FixedOffset =
+ BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
+ int64_t ScalableOffset =
+ BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
+ HasBaseReg, Scale, AccessTy.AddrSpace,
+ Fixup, ScalableOffset);
+ }
case LSRUse::ICmpZero:
// There's not even a target hook for querying whether it would be legal to
// fold a GV into an ICmp.
@@ -1796,19 +1818,20 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
GlobalValue *BaseGV, Immediate BaseOffset,
bool HasBaseReg, int64_t Scale) {
- int64_t Base = BaseOffset.getFixedValue();
- int64_t Min = MinOffset.getFixedValue();
- int64_t Max = MaxOffset.getFixedValue();
+ if (BaseOffset.isNonZero() &&
+ (BaseOffset.isScalable() != MinOffset.isScalable() ||
+ BaseOffset.isScalable() != MaxOffset.isScalable()))
+ return false;
// Check for overflow.
+ int64_t Base = BaseOffset.getKnownMinValue();
+ int64_t Min = MinOffset.getKnownMinValue();
+ int64_t Max = MaxOffset.getKnownMinValue();
if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
return false;
- Min = (uint64_t)Base + Min;
+ MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
return false;
- Max = (uint64_t)Base + Max;
-
- MinOffset = Immediate::getFixed(Min);
- MaxOffset = Immediate::getFixed(Max);
+ MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
HasBaseReg, Scale) &&
@@ -1854,6 +1877,14 @@ static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
F.BaseOffset, F.HasBaseReg, F.Scale);
}
+static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
+ Immediate Offset) {
+ if (Offset.isScalable())
+ return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
+
+ return TTI.isLegalAddImmediate(Offset.getFixedValue());
+}
+
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F) {
// Target may want to look at the user instructions.
@@ -1885,14 +1916,20 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
switch (LU.Kind) {
case LSRUse::Address: {
- int64_t FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
- int64_t FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
// Check the scaling factor cost with both the min and max offsets.
+ int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
+ if (F.BaseOffset.isScalable()) {
+ ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
+ ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
+ } else {
+ FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
+ FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
+ }
InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMin),
+ LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, StackOffset::getFixed(FixedMax),
+ LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
@@ -1929,6 +1966,15 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
HasBaseReg = true;
}
+ // FIXME: Try with + without a scale? Maybe based on TTI?
+ // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
+ // default for many architectures, not just AArch64 SVE. More investigation
+ // needed later to determine if this should be used more widely than just
+ // on scalable types.
+ if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
+ AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
+ Scale = 0;
+
return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
HasBaseReg, Scale);
}
@@ -1953,6 +1999,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
if (BaseOffset.isZero() && !BaseGV)
return true;
+ if (BaseOffset.isScalable())
+ return false;
+
// Conservatively, create an address with an immediate and a
// base and a scale.
int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
@@ -2674,6 +2723,13 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
NewMaxOffset = NewOffset;
}
+ // FIXME: We should be able to handle some level of scalable offset support
+ // for 'void', but in order to get basic support up and running this is
+ // being left out.
+ if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
+ (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
+ return false;
+
// Update the use.
LU.MinOffset = NewMinOffset;
LU.MaxOffset = NewMaxOffset;
@@ -3805,6 +3861,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
// Add the remaining pieces of the add back into the new formula.
const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ // FIXME: Scalable immediates!!!
if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
InnerSumSC->getValue()->getZExtValue())) {
@@ -3966,13 +4023,22 @@ void LSRInstance::GenerateConstantOffsetsImpl(
auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
Formula F = Base;
- F.BaseOffset = Immediate::getFixed(
- (uint64_t)Base.BaseOffset.getFixedValue() - Offset.getFixedValue());
+ if (Base.BaseOffset.isScalable() != Offset.isScalable() &&
+ Base.BaseOffset.isNonZero() && Offset.isNonZero())
+ return;
+ bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable();
+ F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() -
+ Offset.getKnownMinValue(),
+ Scalable);
if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
// Add the offset to the base register.
- const SCEV *NewG = SE.getAddExpr(
- SE.getConstant(G->getType(), Offset.getFixedValue()), G);
+ const SCEV *NewOffset =
+ SE.getConstant(G->getType(), Offset.getKnownMinValue());
+ if (Scalable)
+ NewOffset =
+ SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType()));
+ const SCEV *NewG = SE.getAddExpr(NewOffset, G);
// If it cancelled out, drop the base register, otherwise update it.
if (NewG->isZero()) {
if (IsScaledReg) {
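As a concrete instance of the lambda above (values illustrative): with a zero Base.BaseOffset and a scalable Offset whose known-minimum value is 16, the candidate formula gets F.BaseOffset = -16 * vscale and the base register is rebuilt as (16 * vscale + G). If Base.BaseOffset were a nonzero fixed value instead, the candidate is skipped, since a single Immediate cannot carry both a fixed and a scalable part. A sketch of the scalable SCEV construction, reusing the surrounding SE and G:

    // Offset = 16 * vscale (known-minimum value 16, scalable).
    const SCEV *C = SE.getConstant(G->getType(), 16);
    const SCEV *NewOffset = SE.getMulExpr(C, SE.getVScale(G->getType()));
    const SCEV *NewG = SE.getAddExpr(NewOffset, G); // (16 * vscale + G)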
@@ -4019,11 +4085,13 @@ void LSRInstance::GenerateConstantOffsetsImpl(
GenerateOffset(G, Offset);
Immediate Imm = ExtractImmediate(G, SE);
- if (G->isZero() || Imm.isZero())
+ if (G->isZero() || Imm.isZero() ||
+ Base.BaseOffset.isScalable() != Imm.isScalable())
return;
Formula F = Base;
- F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
- Imm.getFixedValue());
+ F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+ Imm.getKnownMinValue(),
+ Imm.isScalable());
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
return;
if (IsScaledReg) {
@@ -4394,23 +4462,39 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Conservatively examine offsets between this orig reg and a few selected
// other orig regs.
- int64_t First = Imms.begin()->first.getFixedValue();
- int64_t Last = std::prev(Imms.end())->first.getFixedValue();
+ Immediate First = Imms.begin()->first;
+ Immediate Last = std::prev(Imms.end())->first;
+ if (First.isScalable() != Last.isScalable() && First.isNonZero() &&
+ Last.isNonZero()) {
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << "\n");
+ continue;
+ }
+ // Only scalable if both terms are scalable, or if one is scalable and
+ // the other is 0.
+ bool Scalable = First.isScalable() || Last.isScalable();
+ int64_t FI = First.getKnownMinValue();
+ int64_t LI = Last.getKnownMinValue();
// Compute (First + Last) / 2 without overflow using the fact that
// First + Last = 2 * (First & Last) + (First ^ Last).
- int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
- // If the result is negative and First is odd and Last even (or vice versa),
+ int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
+ // If the result is negative and FI is odd and LI even (or vice versa),
// we rounded towards -inf. Add 1 in that case, to round towards 0.
- Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
+ Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
ImmMapTy::const_iterator OtherImms[] = {
Imms.begin(), std::prev(Imms.end()),
- Imms.lower_bound(Immediate::getFixed(Avg))};
+ Imms.lower_bound(Immediate::get(Avg, Scalable))};
for (const auto &M : OtherImms) {
if (M == J || M == JE) continue;
+ if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() &&
+ M->first.isNonZero())
+ continue;
// Compute the difference between the two.
- Immediate Imm = Immediate::getFixed((uint64_t)JImm.getFixedValue() -
- M->first.getFixedValue());
+ bool Scalable = JImm.isScalable() || M->first.isScalable();
+ Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() -
+ M->first.getKnownMinValue(),
+ Scalable);
for (unsigned LUIdx : UsedByIndices.set_bits())
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
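The midpoint computation above relies on the identity First + Last = 2 * (First & Last) + (First ^ Last); the second line corrects the shift's round-towards-negative-infinity behaviour for negative results. A small standalone check of the same arithmetic, independent of LSR and purely for illustration:

    #include <cassert>
    #include <cstdint>

    // Overflow-free (FI + LI) / 2, rounded towards zero, mirroring the
    // computation in GenerateCrossUseConstantOffsets.
    static int64_t midpoint(int64_t FI, int64_t LI) {
      int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
      // If Avg is negative and FI/LI have different parities, the shift
      // rounded towards -inf; add 1 to round towards zero instead.
      Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
      return Avg;
    }

    int main() {
      assert(midpoint(48, 80) == 64);   // exact: even sum
      assert(midpoint(-3, -6) == -4);   // rounds towards zero, not -inf
      assert(midpoint(INT64_MAX, INT64_MAX - 2) == INT64_MAX - 1); // no overflow
      return 0;
    }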
@@ -4433,7 +4517,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
const SCEV *NegImmS =
- SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getFixedValue()));
+ SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue()));
+ if (Imm.isScalable())
+ NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType()));
unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
// TODO: Use a more targeted data structure.
@@ -4446,12 +4532,20 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
F.unscale();
// Use the immediate in the scaled register.
if (F.ScaledReg == OrigReg) {
+ if (F.BaseOffset.isScalable() != Imm.isScalable() &&
+ F.BaseOffset.isNonZero() && Imm.isNonZero())
+ continue;
+ bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable();
Immediate Offset =
- Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() +
- Imm.getFixedValue() * (uint64_t)F.Scale);
+ Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+ Imm.getKnownMinValue() * (uint64_t)F.Scale,
+ Scalable);
// Don't create 50 + reg(-50).
- if (F.referencesReg(SE.getSCEV(
- ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()))))
+ const SCEV *S = SE.getSCEV(
+ ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue()));
+ if (Scalable)
+ S = SE.getMulExpr(S, SE.getVScale(S->getType()));
+ if (F.referencesReg(S))
continue;
Formula NewF = F;
NewF.BaseOffset = Offset;
@@ -4480,21 +4574,27 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (BaseReg != OrigReg)
continue;
Formula NewF = F;
- NewF.BaseOffset = Immediate::getFixed(
- (uint64_t)NewF.BaseOffset.getFixedValue() + Imm.getFixedValue());
+ if (NewF.BaseOffset.isScalable() != Imm.isScalable() &&
+ NewF.BaseOffset.isNonZero() && Imm.isNonZero())
+ continue;
+ bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable();
+ NewF.BaseOffset =
+ Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() +
+ Imm.getKnownMinValue(),
+ Scalable);
if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, NewF)) {
if (AMK == TTI::AMK_PostIndexed &&
mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
continue;
- if (!TTI.isLegalAddImmediate(
- (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
- Imm.getFixedValue()))
+ Immediate NewUnfoldedOffset = Immediate::get(
+ (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() +
+ Imm.getKnownMinValue(),
+ Scalable);
+ if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
continue;
NewF = F;
- NewF.UnfoldedOffset = Immediate::getFixed(
- (uint64_t)NewF.UnfoldedOffset.getFixedValue() +
- Imm.getFixedValue());
+ NewF.UnfoldedOffset = NewUnfoldedOffset;
}
NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
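The same guard recurs throughout this function: a fixed and a scalable immediate may only be combined when at least one of them is zero, and the result is scalable whenever either operand is. A hypothetical pair of helpers expressing that rule (not part of this patch; the names are made up for illustration and assume the Immediate interface used above):

    // Hypothetical: true if the two immediates can be summed into a single
    // Immediate, i.e. they do not mix nonzero fixed and nonzero
    // vscale-relative parts.
    static bool areCompatibleImmediates(Immediate A, Immediate B) {
      if (A.isScalable() == B.isScalable())
        return true;
      return A.isZero() || B.isZero();
    }

    // Hypothetical: sum two compatible immediates.
    static Immediate addImmediates(Immediate A, Immediate B) {
      assert(areCompatibleImmediates(A, B) && "mixing fixed and scalable offsets");
      bool Scalable = A.isScalable() || B.isScalable();
      return Immediate::get((uint64_t)A.getKnownMinValue() + B.getKnownMinValue(),
                            Scalable);
    }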
@@ -5584,9 +5684,17 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
Ops.push_back(SE.getUnknown(FullV));
}
+ // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
+ // out at this point, or should we generate a SCEV adding together mixed
+ // offsets?
+ assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() ||
+ F.BaseOffset.isZero() || LF.Offset.isZero()) &&
+ "Expanding mismatched offsets\n");
+ bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable();
// Expand the immediate portion.
- Immediate Offset = Immediate::getFixed(
- (uint64_t)F.BaseOffset.getFixedValue() + LF.Offset.getFixedValue());
+ Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+ LF.Offset.getKnownMinValue(),
+ Scalable);
if (Offset.isNonZero()) {
if (LU.Kind == LSRUse::ICmpZero) {
// The other interesting way of "folding" with an ICmpZero is to use a
@@ -5601,17 +5709,23 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
} else {
// Just add the immediate values. These again are expected to be matched
// as part of the address.
- Ops.push_back(
- SE.getUnknown(ConstantInt::getSigned(IntTy, Offset.getFixedValue())));
+ const SCEV *SU = SE.getUnknown(
+ ConstantInt::getSigned(IntTy, Offset.getKnownMinValue()));
+ if (Scalable)
+ SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
+ Ops.push_back(SU);
}
}
// Expand the unfolded offset portion.
Immediate UnfoldedOffset = F.UnfoldedOffset;
if (UnfoldedOffset.isNonZero()) {
+ const SCEV *SU = SE.getUnknown(
+ ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue()));
+ if (UnfoldedOffset.isScalable())
+ SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
// Just add the immediate values.
- Ops.push_back(SE.getUnknown(
- ConstantInt::getSigned(IntTy, UnfoldedOffset.getFixedValue())));
+ Ops.push_back(SU);
}
// Emit instructions summing all the operands.
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 4652d0d83919c..de640fa2516a6 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -10,15 +10,16 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[VSCALE]], 48
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[VSCALE]], 80
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[VSCALE]], 5
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[VSCALE]], 48
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[VSCALE]], 5
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[VSCALE]], 4
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[SRC]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 16 x i8>, ptr [[LSR_IV]], align 16
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP3]], align 16
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP2]], align 16
@@ -73,12 +74,9 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[VSCALE]], 5
-; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[STRIDE]]
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[VSCALE]], 3
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[VSCALE]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VSCALE]], 3
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[LSR_IV10:%.*]] = phi i64 [ [[LSR_IV_NEXT11:%.*]], [[FOR_BODY]] ], [ [[COUNT]], [[ENTRY:%.*]] ]
@@ -86,11 +84,13 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i
; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[LSR_IV]]
; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[LSR_IV]]
; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP6]], align 16
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP5]], align 16
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]]
; CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP3]], align 16
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 16 x i8>, ptr [[SCEVGEP2]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 16 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 16 x i8> [[TMP3]], [[TMP5]]
@@ -98,7 +98,8 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i
; CHECK-NEXT: [[TMP9:%.*]] = trunc <vscale x 8 x i16> [[TMP8]] to <vscale x 8 x i8>
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP9]], ptr [[SCEVGEP9]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <vscale x 16 x i8> [[TMP7]] to <vscale x 8 x i16>
-; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[LSR_IV]]
+; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP11:%.*]] = trunc <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x i8>
; CHECK-NEXT: store <vscale x 8 x i8> [[TMP11]], ptr [[SCEVGEP8]], align 8
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], [[MUL]]