[llvm] [SystemZ] i128 cost model (PR #78528)

Wed Jan 17 17:13:58 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: Jonas Paulsson (JonPsson1)

<details>
<summary>Changes</summary>

Update SystemZTTI to reflect the recent change of handling i128 as a legal type in vector registers.


---

Patch is 41.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78528.diff


9 Files Affected:

- (modified) llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp (+99-34) 
- (modified) llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h (+2) 
- (added) llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll (+185) 
- (modified) llvm/test/Analysis/CostModel/SystemZ/int-arith.ll (+6) 
- (modified) llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll (+9) 
- (removed) llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll (-27) 
- (modified) llvm/test/Analysis/CostModel/SystemZ/load_store.ll (+43-29) 
- (added) llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll (+48) 
- (modified) llvm/test/Analysis/CostModel/SystemZ/logical.ll (+12) 


``````````diff

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e21d3090ba2fd1a..d69ff9e96c3e473 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -75,8 +75,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
     return TTI::TCC_Free;
-  // No cost model for operations on integers larger than 64 bit implemented yet.
-  if (BitSize > 64)
+  // No cost model for operations on integers larger than 128 bit implemented yet.
+  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
     return TTI::TCC_Free;
 
   if (Imm == 0)
@@ -96,7 +96,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
     return 2 * TTI::TCC_Basic;
   }
 
-  return 4 * TTI::TCC_Basic;
+  // i128 immediates are loaded from the Constant Pool.
+  return 2 * TTI::TCC_Basic;
 }
 
 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
@@ -466,6 +467,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     }
   }
 
+  unsigned ImmLoadCost = 0;
+
   if (!Ty->isVectorTy()) {
     // These FP operations are supported with a dedicated instruction for
     // float, double and fp128 (base implementation assumes float generally
@@ -478,30 +481,43 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     if (Opcode == Instruction::FRem)
       return LIBCALL_COST;
 
+    // Most i128 immediates must be loaded from the constant pool.
+    if (Ty->isIntegerTy(128))
+      for (const Value *A : Args)
+        if (auto *C = dyn_cast<ConstantInt>(A))
+          if (Opcode != Instruction::Xor || !C->isAllOnesValue())
+            ImmLoadCost++;
+
     // Give discount for some combined logical operations if supported.
-    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+    if (Args.size() == 2) {
       if (Opcode == Instruction::Xor) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
             if (I->hasOneUse() &&
-                (I->getOpcode() == Instruction::And ||
-                 I->getOpcode() == Instruction::Or ||
+                (I->getOpcode() == Instruction::Or ||
+                 I->getOpcode() == Instruction::And ||
                  I->getOpcode() == Instruction::Xor))
-              return 0;
+              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                  (isInt128InVR(Ty) &&
+                   (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
+                return 0 + ImmLoadCost;
         }
       }
-      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
-            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
-              return 0;
+            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
+                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                 (isInt128InVR(Ty) &&
+                  (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
+              return 0 + ImmLoadCost;
         }
       }
     }
 
     // Or requires one instruction, although it has custom handling for i64.
     if (Opcode == Instruction::Or)
-      return 1;
+      return 1 + ImmLoadCost;
 
     if (Opcode == Instruction::Xor && ScalarBits == 1) {
       if (ST->hasLoadStoreOnCond2())
@@ -589,7 +605,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
-                                       Args, CxtI);
+                                       Args, CxtI) + ImmLoadCost;
 }
 
 InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
@@ -774,29 +790,63 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     assert (!Dst->isVectorTy());
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (Src->isIntegerTy(128))
+        return LIBCALL_COST;
       if (SrcScalarBits >= 32 ||
           (I != nullptr && isa<LoadInst>(I->getOperand(0))))
         return 1;
       return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
     }
 
-    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-        Src->isIntegerTy(1)) {
-      if (ST->hasLoadStoreOnCond2())
-        return 2; // li 0; loc 1
-
-      // This should be extension of a compare i1 result, which is done with
-      // ipm and a varying sequence of instructions.
-      unsigned Cost = 0;
-      if (Opcode == Instruction::SExt)
-        Cost = (DstScalarBits < 64 ? 3 : 4);
-      if (Opcode == Instruction::ZExt)
-        Cost = 3;
-      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
-      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
-        // If operands of an fp-type was compared, this costs +1.
-        Cost++;
-      return Cost;
+    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
+        Dst->isIntegerTy(128))
+      return LIBCALL_COST;
+
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
+      if (Src->isIntegerTy(1)) {
+        if (DstScalarBits == 128)
+          return 5 /*branch seq.*/;
+
+        if (ST->hasLoadStoreOnCond2())
+          return 2; // li 0; loc 1
+
+        // This should be extension of a compare i1 result, which is done with
+        // ipm and a varying sequence of instructions.
+        unsigned Cost = 0;
+        if (Opcode == Instruction::SExt)
+          Cost = (DstScalarBits < 64 ? 3 : 4);
+        if (Opcode == Instruction::ZExt)
+          Cost = 3;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+          // If operands of an fp-type were compared, this costs +1.
+          Cost++;
+        return Cost;
+      }
+      else if (isInt128InVR(Dst)) {
+        // Extensions from GPR to i128 (in VR) typically cost two instructions,
+        // but a zero-extending load would be just one extra instruction.
+        if (Opcode == Instruction::ZExt && I != nullptr)
+          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+            if (Ld->hasOneUse())
+              return 1;
+        return 2;
+      }
+    }
+
+    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
+      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+        if (Ld->hasOneUse())
+          return 0;  // Will be converted to GPR load.
+      bool OnlyTruncatingStores = true;
+      for (const User *U : I->users())
+        if (!isa<StoreInst>(U)) {
+          OnlyTruncatingStores = false;
+          break;
+        }
+      if (OnlyTruncatingStores)
+        return 0;
+      return 2; // Vector element extraction.
     }
   }
   else if (ST->hasVector()) {
@@ -930,7 +980,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       // A loaded value compared with 0 with multiple users becomes Load and
       // Test. The load is then not foldable, so return 0 cost for the ICmp.
       unsigned ScalarBits = ValTy->getScalarSizeInBits();
-      if (I != nullptr && ScalarBits >= 32)
+      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
         if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
           if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
             if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
@@ -940,11 +990,21 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       unsigned Cost = 1;
       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
         Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
+      if (isInt128InVR(ValTy) && I != nullptr &&
+          isa<ConstantInt>(I->getOperand(1)))
+        Cost++;
       return Cost;
     }
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
         return 4; // No load on condition for FP - costs a conditional jump.
+      if (I != nullptr && isInt128InVR(ValTy)) {
+        unsigned ImmLoadCost = 0;
+        if (isa<ConstantInt>(I->getOperand(1)) ||
+            isa<ConstantInt>(I->getOperand(2)))
+          ImmLoadCost++;
+        return 4 + ImmLoadCost;
+      }
       return 1; // Load On Condition / Select Register.
     }
   }
@@ -1157,6 +1217,15 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
+  // Storing an i128 constant requires load from Constant Pool.
+  if (isInt128InVR(Src) && Opcode == Instruction::Store && I != nullptr &&
+      isa<ConstantInt>(I->getOperand(0)))
+    return 2;
+
+  // FP128 is a legal type but kept in a register pair on older CPUs.
+  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
+    return 2;
+
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
@@ -1177,10 +1246,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     }
   }
 
-  if (Src->getScalarSizeInBits() == 128)
-    // 128 bit scalars are held in a pair of two 64 bit registers.
-    NumOps *= 2;
-
   return  NumOps;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 62c59ddc3f06a39..2cccdf6d17dacf4 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -28,6 +28,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
 
   unsigned const LIBCALL_COST = 30;
 
+  bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); }
+
 public:
   explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
new file mode 100644
index 000000000000000..d3e60c7df51e5a1
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
@@ -0,0 +1,185 @@
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+
+define i128 @fun1(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun1'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = sext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = sext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun2(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun2'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = zext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = zext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun3(i128 %val1, i128 %val2,
+                  i128 %val3, i128 %val4) {
+; CHECK-LABEL: 'fun3'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %add = add i128 %val3, %val4
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %sel = select i1 %cmp, i128 %val3, i128 %add
+  %cmp = icmp eq i128 %val1, %val2
+  %add = add i128 %val3, %val4
+  %sel = select i1 %cmp, i128 %val3, i128 %add
+  ret i128 %sel
+}
+
+
+define i128 @fun3_b(i128 %val1) {
+; CHECK-LABEL: 'fun3_b'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 %val1, i128 456
+  %cmp = icmp eq i128 %val1, 123
+  %sel = select i1 %cmp, i128 %val1, i128 456
+  ret i128 %sel
+}
+
+define i128 @fun3_c(i128 %val1) {
+; CHECK-LABEL: 'fun3_c'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 567, i128 456
+  %cmp = icmp eq i128 %val1, 123
+  %sel = select i1 %cmp, i128 567, i128 456
+  ret i128 %sel
+}
+
+define i128 @fun4(ptr %src) {
+; CHECK-LABEL: 'fun4'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun5(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun5'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun6(ptr %src) {
+; CHECK-LABEL: 'fun6'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res = zext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun7(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun7'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = zext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+; Truncating store is free.
+define void @fun8(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun8'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret void
+}
+
+; If there is a non-store user, an extraction is needed.
+define i64 @fun9(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun9'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret i64 %t
+}
+
+; Truncation of load is free.
+define i64 @fun10(ptr %src) {
+; CHECK-LABEL: 'fun10'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; If the load has another user, the truncation becomes an extract.
+define i64 @fun11(ptr %src, i128 %val2, ptr %dst) {
+; CHECK-LABEL: 'fun11'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  %a = add i128 %v, %val2
+  store i128 %a, ptr %dst
+  ret i64 %t
+}
+
+; Truncation with a GPR use typically requires an extraction.
+define i64 @fun12(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: 'fun12'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; Fp<->Int conversions require libcalls.
+define void @fun13() {
+; CHECK-LABEL: 'fun13'
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v0 = fptosi fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v1 = fptosi double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v2 = fptosi float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v3 = fptoui fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v4 = fptoui double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v5 = fptoui float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v6 = sitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v7 = sitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v8 = sitofp i128 undef to float
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v9 = uitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v10 = uitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v11 = uitofp i128 undef to float
+  %v0 = fptosi fp128 undef to i128
+  %v1 = fptosi double undef to i128
+  %v2 = fptosi float undef to i128
+  %v3 = fptoui fp128 undef to i128
+  %v4 = fptoui double undef to i128
+  %v5 = fptoui float undef to i128
+  %v6 = sitofp i128 undef to fp128
+  %v7 = sitofp i128 undef to double
+  %v8 = sitofp i128 undef to float
+  %v9 = uitofp i128 undef to fp128
+  %v10 = uitofp i128 undef to double
+  %v11 = uitofp i128 undef to float
+  ret void
+}
+
+; All i128 immediates (big and small) are loaded from the constant pool.
+define void @fun14(ptr %dst) {
+; CHECK-LABEL: 'fun14'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 166153499473114484112, ptr %dst, align 8
+  store i128 166153499473114484112, ptr %dst, align 8
+  ret void
+}
+
+define void @fun15(ptr %dst) {
+; CHECK-LABEL: 'fun15'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 123, ptr %dst, align 8
+  store i128 123, ptr %dst, align 8
+  ret void
+}
+
+define void @fun16(ptr %dst, i128 %val1) {
+; CHECK-LABEL: 'fun16'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = add i128 %val1, 123
+  %res = add i128 %val1, 123
+  store i128 %res, ptr %dst, align 8
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
index 71863b923ca38b3..fc4d19c5cdf9e58 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -8,6 +8,7 @@ define void @add() {
   %res1 = add i16 undef, undef
   %res2 = add i32 undef, undef
   %res3 = add i64 undef, undef
+  %resQ = add i128 undef, undef
   %res4 = add <2 x i8> undef, undef
   %res5 = add <2 x i16> undef, undef
   %res6 = add <2 x i32> undef, undef
@@ -29,6 +30,7 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = add i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = add i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = add i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = add i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = add <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = add <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = add <2 x i32> undef, undef
@@ -54,6 +56,7 @@ define void @sub() {
   %res1 = sub i16 undef, undef
   %res2 = sub i32 undef, undef
   %res3 = sub i64 undef, undef
+  %resQ = sub i128 undef, undef
   %res4 = sub <2 x i8> undef, undef
   %res5 = sub <2 x i16> undef, undef
   %res6 = sub <2 x i32> undef, undef
@@ -75,6 +78,7 @@ define void @sub() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = sub i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = sub i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sub i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = sub i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = sub <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = sub <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = sub <2 x i32> undef, undef
@@ -100,6 +104,7 @@ define void @mul() {
   %res1 = mul i16 undef, undef
   %res2 = mul i32 undef, undef
   %res3 = mul i64 undef, undef
+  %resQ = mul i128 undef, undef
   %res4 = mul <2 x ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/78528