[llvm] 7d82c99 - [RISCV][TTI] Account for constant materialization cost when costing arithmetic operations

Wed Nov 30 07:20:58 PST 2022

Author: Philip Reames
Date: 2022-11-30T07:20:51-08:00
New Revision: 7d82c99403f615f6236334e698720bf979959704

URL: https://github.com/llvm/llvm-project/commit/7d82c99403f615f6236334e698720bf979959704
DIFF: https://github.com/llvm/llvm-project/commit/7d82c99403f615f6236334e698720bf979959704.diff

LOG: [RISCV][TTI] Account for constant materialization cost when costing arithmetic operations

At the IR level, we generally assume that constants are free to materialize. However, for RISCV, due to some quirks of the ISA, materializing arbitrary constants can be rather expensive. We frequently fall back to constant pool loads.

We've been slowly moving in the direction of modeling the cost of the remat as part of the instruction cost. This has the effect of disincentivizing vectorization - mostly SLP - when we'd have to materialize an expensive constant.

We need better modeling of which constants are expensive and not, but for the moment let's be consistent with how we model arithmetic and memory instructions. The difference between the two is that arithmetic can sometimes fold a splat operation, which stores cannot.

Differential Revision: https://reviews.llvm.org/D138941

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/RISCV/arith-int.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ab981c1638f8..34b0ec46f2c7 100644

--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1283,11 +1283,8 @@ bool RISCVTargetLowering::
   return !XC;
 }
 
-bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
-  if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
-    return false;
-
-  switch (I->getOpcode()) {
+bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const {
+  switch (Opcode) {
   case Instruction::Add:
   case Instruction::Sub:
   case Instruction::Mul:
@@ -1309,38 +1306,48 @@ bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
   case Instruction::URem:
   case Instruction::SRem:
     return Operand == 1;
-  case Instruction::Call:
-    if (auto *II = dyn_cast<IntrinsicInst>(I)) {
-      switch (II->getIntrinsicID()) {
-      case Intrinsic::fma:
-      case Intrinsic::vp_fma:
-        return Operand == 0 || Operand == 1;
-      case Intrinsic::vp_shl:
-      case Intrinsic::vp_lshr:
-      case Intrinsic::vp_ashr:
-      case Intrinsic::vp_udiv:
-      case Intrinsic::vp_sdiv:
-      case Intrinsic::vp_urem:
-      case Intrinsic::vp_srem:
-        return Operand == 1;
-        // These intrinsics are commutative.
-      case Intrinsic::vp_add:
-      case Intrinsic::vp_mul:
-      case Intrinsic::vp_and:
-      case Intrinsic::vp_or:
-      case Intrinsic::vp_xor:
-      case Intrinsic::vp_fadd:
-      case Intrinsic::vp_fmul:
-        // These intrinsics have 'vr' versions.
-      case Intrinsic::vp_sub:
-      case Intrinsic::vp_fsub:
-      case Intrinsic::vp_fdiv:
-        return Operand == 0 || Operand == 1;
-      default:
-        return false;
-      }
-    }
+  default:
     return false;
+  }
+}
+
+
+bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
+  if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
+    return false;
+
+  if (canSplatOperand(I->getOpcode(), Operand))
+    return true;
+
+  auto *II = dyn_cast<IntrinsicInst>(I);
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::fma:
+  case Intrinsic::vp_fma:
+    return Operand == 0 || Operand == 1;
+  case Intrinsic::vp_shl:
+  case Intrinsic::vp_lshr:
+  case Intrinsic::vp_ashr:
+  case Intrinsic::vp_udiv:
+  case Intrinsic::vp_sdiv:
+  case Intrinsic::vp_urem:
+  case Intrinsic::vp_srem:
+    return Operand == 1;
+    // These intrinsics are commutative.
+  case Intrinsic::vp_add:
+  case Intrinsic::vp_mul:
+  case Intrinsic::vp_and:
+  case Intrinsic::vp_or:
+  case Intrinsic::vp_xor:
+  case Intrinsic::vp_fadd:
+  case Intrinsic::vp_fmul:
+    // These intrinsics have 'vr' versions.
+  case Intrinsic::vp_sub:
+  case Intrinsic::vp_fsub:
+  case Intrinsic::vp_fdiv:
+    return Operand == 0 || Operand == 1;
   default:
     return false;
   }

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 56027c8ad937..807f3eaff9fd 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -365,6 +365,9 @@ class RISCVTargetLowering : public TargetLowering {
   /// Return true if the (vector) instruction I will be lowered to an instruction
   /// with a scalar splat operand for the given Operand number.
   bool canSplatOperand(Instruction *I, int Operand) const;
+  /// Return true if a vector instruction will lower to a target instruction
+  /// able to splat the given operand.
+  bool canSplatOperand(unsigned Opcode, int Operand) const;
   bool shouldSinkOperands(Instruction *I,
                           SmallVectorImpl<Use *> &Ops) const override;
   bool shouldScalarizeBinop(SDValue VecOp) const override;

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 28a8be7631fd..469e5d051805 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1079,6 +1079,31 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                          Args, CxtI);
 
+
+  auto getConstantMatCost =
+    [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
+    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
+      // Two sub-cases:
+      // * Has a 5 bit immediate operand which can be splatted.
+      // * Has a larger immediate which must be materialized in scalar register
+      // We return 0 for both as we currently ignore the cost of materializing
+      // scalar constants in GPRs.
+      return 0;
+
+    // Add a cost of address generation + the cost of the vector load. The
+    // address is expected to be a PC relative offset to a constant pool entry
+    // using auipc/addi.
+    return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
+                               /*AddressSpace=*/0, CostKind);
+  };
+
+  // Add the cost of materializing any constant vectors required.
+  InstructionCost ConstantMatCost = 0;
+  if (Op1Info.isConstant())
+    ConstantMatCost += getConstantMatCost(0, Op1Info);
+  if (Op2Info.isConstant())
+    ConstantMatCost += getConstantMatCost(1, Op2Info);
+
   switch (TLI->InstructionOpcodeToISD(Opcode)) {
   case ISD::ADD:
   case ISD::SUB:
@@ -1095,13 +1120,12 @@ InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
   case ISD::FSUB:
   case ISD::FMUL:
   case ISD::FNEG: {
-    // TODO: Add the cost of materializing any constant vectors required since
-    // we otherwise treat constants as no-cost.
     // TODO: We should be accounting for LMUL and scaling costs for LMUL > 1.
-    return LT.first * 1;
+    return ConstantMatCost + LT.first * 1;
   }
   default:
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+    return ConstantMatCost +
+           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                          Args, CxtI);
   }
 }

diff  --git a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
index fe5e68f1ac78..3558094b01b2 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-int.ll
@@ -815,14 +815,14 @@ define void @add_of_constant() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = add <2 x i64> <i64 1, i64 1>, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = add <4 x i32> <i32 4096, i32 4096, i32 4096, i32 4096>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = add <4 x i32> <i32 1, i32 1, i32 2, i32 1>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = add <4 x i32> <i32 2, i32 1, i32 1, i32 1>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = add <4 x i32> <i32 1, i32 2, i32 3, i32 4>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = add <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -4>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = add <4 x i32> <i32 2, i32 4, i32 6, i32 8>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = add <4 x i32> <i32 -1, i32 0, i32 2, i32 1>, undef
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %15 = add <4 x i32> <i32 256, i32 4096, i32 57, i32 1>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %8 = add <4 x i32> <i32 1, i32 1, i32 2, i32 1>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %9 = add <4 x i32> <i32 2, i32 1, i32 1, i32 1>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %10 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %11 = add <4 x i32> <i32 1, i32 2, i32 3, i32 4>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %12 = add <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -4>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %13 = add <4 x i32> <i32 2, i32 4, i32 6, i32 8>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %14 = add <4 x i32> <i32 -1, i32 0, i32 2, i32 1>, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %15 = add <4 x i32> <i32 256, i32 4096, i32 57, i32 1>, undef
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;