[llvm] 4ddc8df - [CostModel][ARM]Adjust cost of muls in (U/S)MLAL and patterns (#122713)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 05:25:48 PDT 2025
Author: Nashe Mncube
Date: 2025-03-19T12:25:44Z
New Revision: 4ddc8df6ca1c9c1a1f03f03baefba5d1f6e2e78b
URL: https://github.com/llvm/llvm-project/commit/4ddc8df6ca1c9c1a1f03f03baefba5d1f6e2e78b
DIFF: https://github.com/llvm/llvm-project/commit/4ddc8df6ca1c9c1a1f03f03baefba5d1f6e2e78b.diff
LOG: [CostModel][ARM]Adjust cost of muls in (U/S)MLAL and patterns (#122713)
PR #117350 made changes to the SLP vectorizer which introduced a
regression on some ARM benchmarks. Investigation narrowed it down to
suboptimal codegen for benchmarks that previously only used scalar (U/S)MLAL
instructions. The linked change meant the SLPVectorizer thought that
these could be vectorized. This change makes the cost of muls in
(U/S)MLAL patterns slightly cheaper to make sure scalar instructions are
preferred in these cases over SLP vectorization on targets supporting DSP.
Added:
llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
Modified:
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cc8a6d9449a05..dc2909e146dcd 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1458,16 +1458,73 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
if (LooksLikeAFreeShift())
return 0;
+ // Returns true when the scalar mul \p I (with opcode \p Opcode and type
+ // \p Ty) looks like the multiply of a (U/S)MLAL pattern on a DSP target,
+ // i.e. ext(mul(ext(i16), ext(i16))). When targets have both DSP and MVE
+ // the compiler will attempt to vectorize as well as use scalar (S/U)MLAL
+ // operations, but codegen performs better when only scalar (S/U)MLAL ops
+ // are used instead of mixing them with vector ops, so a mul matching this
+ // pattern is given a cost of 0 right after this lambda.
+ auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
+ Type *Ty) -> bool {
+ if (!ST->hasDSP())
+ return false;
+
+ if (!I)
+ return false;
+
+ if (Opcode != Instruction::Mul)
+ return false;
+
+ if (Ty->isVectorTy())
+ return false;
+
+ auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
+ return cast<Instruction>(LHS)->getOpcode() ==
+ cast<Instruction>(RHS)->getOpcode();
+ };
+ auto IsExtInst = [](const Value *V) -> bool {
+ return isa<ZExtInst>(V) || isa<SExtInst>(V);
+ };
+ auto IsExtensionFromHalf = [&, IsExtInst](const Value *V) -> bool {
+ return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
+ };
+
+ // Both operands of the mul must be extends of the same kind (sext or zext).
+ auto *BinOp = dyn_cast<BinaryOperator>(I);
+ if (!BinOp)
+ return false;
+ Value *Op0 = BinOp->getOperand(0);
+ Value *Op1 = BinOp->getOperand(1);
+ if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
+ // The mul must produce i32 and both extends must widen from i16.
+ if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
+ !IsExtensionFromHalf(Op1))
+ return false;
+ // Every user of the mul must itself be a sext or zext. Neither the
+ // destination width nor the extend kind of those users is checked here.
+ for (auto *U : I->users())
+ if (!IsExtInst(U))
+ return false;
+ return true;
+ }
+
+ return false;
+ };
+
+ if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
+ return 0;
+
// Default to cheap (throughput/size of 1 instruction) but adjust throughput
// for "multiple beats" potentially needed by MVE instructions.
int BaseCost = 1;
if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
BaseCost = ST->getMVEVectorCostFactor(CostKind);
- // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
- // without treating floats as more expensive that scalars or increasing the
- // costs for custom operations. The results is also multiplied by the
- // MVEVectorCostFactor where appropriate.
+ // The rest of this mostly follows what is done in
+ // BaseT::getArithmeticInstrCost, without treating floats as more expensive
+ // than scalars or increasing the costs for custom operations. The result is
+ // also multiplied by the MVEVectorCostFactor where appropriate.
if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
return LT.first * BaseCost;
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
new file mode 100644
index 0000000000000..7de2799d5af9c
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-smlal-patterns.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+
+define i64 @test(i16 %a, i16 %b) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'test'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; sext(mul(sext i16, sext i16)) to i64: the mul is expected to cost 0 with +dsp and 1 without.
+ %as = sext i16 %a to i32
+ %bs = sext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = sext i32 %m to i64
+ ret i64 %ms
+}
+
+define i64 @withadd(i16 %a, i16 %b, i64 %c) {
+; CHECK-LABEL: 'withadd'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withadd'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; sext(mul(sext i16, sext i16)) accumulated into an i64 add: the mul is expected to cost 0 with +dsp and 1 without.
+ %as = sext i16 %a to i32
+ %bs = sext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = sext i32 %m to i64
+ %r = add i64 %c, %ms
+ ret i64 %r
+}
+
+define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
+; CHECK-LABEL: 'withloads'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withloads'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = sext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; sext(mul(sext i16, sext i16)) with both i16 inputs loaded from memory: the mul is expected to cost 0 with +dsp and 1 without.
+ %a = load i16, ptr %pa
+ %b = load i16, ptr %pb
+ %as = sext i16 %a to i32
+ %bs = sext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = sext i32 %m to i64
+ %r = add i64 %c, %ms
+ ret i64 %r
+}
+
+define i64 @different_extend_ops(i16 %a, i16 %b) {
+; CHECK-LABEL: 'different_extend_ops'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'different_extend_ops'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = sext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = sext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+ %as = sext i16 %a to i32
+ %bs = zext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = sext i32 %m to i64
+ ret i64 %ms
+}
diff --git a/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
new file mode 100644
index 0000000000000..521816d13000b
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/ARM/muls-in-umull-patterns.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main -mattr=+dsp < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple thumbv8.1-m.main < %s | FileCheck %s --check-prefix=CHECK-NO-DSP
+define i64 @test(i16 %a, i16 %b) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+;
+; CHECK-NO-DSP-LABEL: 'test'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ms
+; zext(mul(zext i16, zext i16)) to i64: the mul is expected to cost 0 with +dsp and 1 without.
+ %as = zext i16 %a to i32
+ %bs = zext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = zext i32 %m to i64
+ ret i64 %ms
+}
+
+define i64 @withadd(i16 %a, i16 %b, i64 %c) {
+; CHECK-LABEL: 'withadd'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withadd'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; zext(mul(zext i16, zext i16)) accumulated into an i64 add: the mul is expected to cost 0 with +dsp and 1 without.
+ %as = zext i16 %a to i32
+ %bs = zext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = zext i32 %m to i64
+ %r = add i64 %c, %ms
+ ret i64 %r
+}
+
+define i64 @withloads(ptr %pa, ptr %pb, i64 %c) {
+; CHECK-LABEL: 'withloads'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %m = mul i32 %as, %bs
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
+; CHECK-NO-DSP-LABEL: 'withloads'
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a = load i16, ptr %pa, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = load i16, ptr %pb, align 2
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %as = zext i16 %a to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %bs = zext i16 %b to i32
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %m = mul i32 %as, %bs
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ms = zext i32 %m to i64
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = add i64 %c, %ms
+; CHECK-NO-DSP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+; zext(mul(zext i16, zext i16)) with both i16 inputs loaded from memory: the mul is expected to cost 0 with +dsp and 1 without.
+ %a = load i16, ptr %pa
+ %b = load i16, ptr %pb
+ %as = zext i16 %a to i32
+ %bs = zext i16 %b to i32
+ %m = mul i32 %as, %bs
+ %ms = zext i32 %m to i64
+ %r = add i64 %c, %ms
+ ret i64 %r
+}
More information about the llvm-commits
mailing list