[llvm] [SystemZ] i128 cost model (PR #78528)

Thu Jan 18 08:36:02 PST 2024

https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/78528

>From 4b80902a18d5467412a7580f096fc8929e8fe98e Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Wed, 17 Jan 2024 17:33:03 -0600
Subject: [PATCH 1/2] Mainline i128 cost model

---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 133 +++++++++----
 .../SystemZ/SystemZTargetTransformInfo.h      |   2 +
 .../CostModel/SystemZ/i128-cmp-ext-conv.ll    | 185 ++++++++++++++++++
 .../Analysis/CostModel/SystemZ/int-arith.ll   |   6 +
 .../Analysis/CostModel/SystemZ/intrinsics.ll  |   9 +
 .../CostModel/SystemZ/load-to-trunc.ll        |  27 ---
 .../Analysis/CostModel/SystemZ/load_store.ll  |  72 ++++---
 .../Analysis/CostModel/SystemZ/logic-i128.ll  |  48 +++++
 .../Analysis/CostModel/SystemZ/logical.ll     |  12 ++
 9 files changed, 404 insertions(+), 90 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
 delete mode 100644 llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
 create mode 100644 llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e21d3090ba2fd1..d69ff9e96c3e47 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -75,8 +75,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
     return TTI::TCC_Free;
-  // No cost model for operations on integers larger than 64 bit implemented yet.
-  if (BitSize > 64)
+  // No cost model for integers wider than 128 bits (64 bits without vector).
+  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
     return TTI::TCC_Free;
 
   if (Imm == 0)
@@ -96,7 +96,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
     return 2 * TTI::TCC_Basic;
   }
 
-  return 4 * TTI::TCC_Basic;
+  // i128 immediates are loaded from the Constant Pool.
+  return 2 * TTI::TCC_Basic;
 }
 
 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
@@ -466,6 +467,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     }
   }
 
+  unsigned ImmLoadCost = 0;
+
   if (!Ty->isVectorTy()) {
     // These FP operations are supported with a dedicated instruction for
     // float, double and fp128 (base implementation assumes float generally
@@ -478,30 +481,43 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     if (Opcode == Instruction::FRem)
       return LIBCALL_COST;
 
+    // Most i128 immediates must be loaded from the constant pool.
+    if (Ty->isIntegerTy(128))
+      for (const Value *A : Args)
+        if (auto *C = dyn_cast<ConstantInt>(A))
+          if (Opcode != Instruction::Xor || !C->isAllOnesValue())
+            ImmLoadCost++;
+
     // Give discount for some combined logical operations if supported.
-    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+    if (Args.size() == 2) {
       if (Opcode == Instruction::Xor) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
             if (I->hasOneUse() &&
-                (I->getOpcode() == Instruction::And ||
-                 I->getOpcode() == Instruction::Or ||
+                (I->getOpcode() == Instruction::Or ||
+                 I->getOpcode() == Instruction::And ||
                  I->getOpcode() == Instruction::Xor))
-              return 0;
+              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                  (isInt128InVR(Ty) &&
+                   (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
+                return 0 + ImmLoadCost;
         }
       }
-      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
         for (const Value *A : Args) {
           if (const Instruction *I = dyn_cast<Instruction>(A))
-            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
-              return 0;
+            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
+                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
+                 (isInt128InVR(Ty) &&
+                  (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
+              return 0 + ImmLoadCost;
         }
       }
     }
 
     // Or requires one instruction, although it has custom handling for i64.
     if (Opcode == Instruction::Or)
-      return 1;
+      return 1 + ImmLoadCost;
 
     if (Opcode == Instruction::Xor && ScalarBits == 1) {
       if (ST->hasLoadStoreOnCond2())
@@ -589,7 +605,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
-                                       Args, CxtI);
+                                       Args, CxtI) + ImmLoadCost;
 }
 
 InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
@@ -774,29 +790,63 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     assert (!Dst->isVectorTy());
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (Src->isIntegerTy(128))
+        return LIBCALL_COST;
       if (SrcScalarBits >= 32 ||
           (I != nullptr && isa<LoadInst>(I->getOperand(0))))
         return 1;
       return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
     }
 
-    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-        Src->isIntegerTy(1)) {
-      if (ST->hasLoadStoreOnCond2())
-        return 2; // li 0; loc 1
-
-      // This should be extension of a compare i1 result, which is done with
-      // ipm and a varying sequence of instructions.
-      unsigned Cost = 0;
-      if (Opcode == Instruction::SExt)
-        Cost = (DstScalarBits < 64 ? 3 : 4);
-      if (Opcode == Instruction::ZExt)
-        Cost = 3;
-      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
-      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
-        // If operands of an fp-type was compared, this costs +1.
-        Cost++;
-      return Cost;
+    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
+        Dst->isIntegerTy(128))
+      return LIBCALL_COST;
+
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
+      if (Src->isIntegerTy(1)) {
+        if (DstScalarBits == 128)
+          return 5 /*branch seq.*/;
+
+        if (ST->hasLoadStoreOnCond2())
+          return 2; // li 0; loc 1
+
+        // This should be extension of a compare i1 result, which is done with
+        // ipm and a varying sequence of instructions.
+        unsigned Cost = 0;
+        if (Opcode == Instruction::SExt)
+          Cost = (DstScalarBits < 64 ? 3 : 4);
+        if (Opcode == Instruction::ZExt)
+          Cost = 3;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+          // If operands of an fp-type were compared, this costs +1.
+          Cost++;
+        return Cost;
+      }
+      else if (isInt128InVR(Dst)) {
+        // Extensions from GPR to i128 (in VR) typically cost two instructions,
+        // but a zero-extending load would be just one extra instruction.
+        if (Opcode == Instruction::ZExt && I != nullptr)
+          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+            if (Ld->hasOneUse())
+              return 1;
+        return 2;
+      }
+    }
+
+    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
+      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+        if (Ld->hasOneUse())
+          return 0;  // Will be converted to GPR load.
+      bool OnlyTruncatingStores = true;
+      for (const User *U : I->users())
+        if (!isa<StoreInst>(U)) {
+          OnlyTruncatingStores = false;
+          break;
+        }
+      if (OnlyTruncatingStores)
+        return 0;
+      return 2; // Vector element extraction.
     }
   }
   else if (ST->hasVector()) {
@@ -930,7 +980,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       // A loaded value compared with 0 with multiple users becomes Load and
       // Test. The load is then not foldable, so return 0 cost for the ICmp.
       unsigned ScalarBits = ValTy->getScalarSizeInBits();
-      if (I != nullptr && ScalarBits >= 32)
+      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
         if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
           if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
             if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
@@ -940,11 +990,21 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       unsigned Cost = 1;
       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
         Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
+      if (isInt128InVR(ValTy) && I != nullptr &&
+          isa<ConstantInt>(I->getOperand(1)))
+        Cost++;
       return Cost;
     }
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
         return 4; // No load on condition for FP - costs a conditional jump.
+      if (I != nullptr && isInt128InVR(ValTy)) {
+        unsigned ImmLoadCost = 0;
+        if (isa<ConstantInt>(I->getOperand(1)) ||
+            isa<ConstantInt>(I->getOperand(2)))
+          ImmLoadCost++;
+        return 4 + ImmLoadCost;
+      }
       return 1; // Load On Condition / Select Register.
     }
   }
@@ -1157,6 +1217,15 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
+  // Storing an i128 constant requires load from Constant Pool.
+  if (isInt128InVR(Src) && Opcode == Instruction::Store && I != nullptr &&
+      isa<ConstantInt>(I->getOperand(0)))
+    return 2;
+
+  // FP128 is a legal type but kept in a register pair on older CPUs.
+  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
+    return 2;
+
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
@@ -1177,10 +1246,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     }
   }
 
-  if (Src->getScalarSizeInBits() == 128)
-    // 128 bit scalars are held in a pair of two 64 bit registers.
-    NumOps *= 2;
-
   return  NumOps;
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 62c59ddc3f06a3..2cccdf6d17dacf 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -28,6 +28,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
 
   unsigned const LIBCALL_COST = 30;
 
+  bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); }
+
 public:
   explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
new file mode 100644
index 00000000000000..d3e60c7df51e5a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
@@ -0,0 +1,185 @@
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+
+define i128 @fun1(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun1'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = sext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = sext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun2(i128 %val1, i128 %val2) {
+; CHECK-LABEL: 'fun2'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v128 = zext i1 %cmp to i128
+  %cmp = icmp eq i128 %val1, %val2
+  %v128 = zext i1 %cmp to i128
+  ret i128 %v128
+}
+
+define i128 @fun3(i128 %val1, i128 %val2,
+                  i128 %val3, i128 %val4) {
+; CHECK-LABEL: 'fun3'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %cmp = icmp eq i128 %val1, %val2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %add = add i128 %val3, %val4
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %sel = select i1 %cmp, i128 %val3, i128 %add
+  %cmp = icmp eq i128 %val1, %val2
+  %add = add i128 %val3, %val4
+  %sel = select i1 %cmp, i128 %val3, i128 %add
+  ret i128 %sel
+}
+
+
+define i128 @fun3_b(i128 %val1) {
+; CHECK-LABEL: 'fun3_b'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 %val1, i128 456
+  %cmp = icmp eq i128 %val1, 123
+  %sel = select i1 %cmp, i128 %val1, i128 456
+  ret i128 %sel
+}
+
+define i128 @fun3_c(i128 %val1) {
+; CHECK-LABEL: 'fun3_c'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 567, i128 456
+  %cmp = icmp eq i128 %val1, 123
+  %sel = select i1 %cmp, i128 567, i128 456
+  ret i128 %sel
+}
+
+define i128 @fun4(ptr %src) {
+; CHECK-LABEL: 'fun4'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun5(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun5'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = sext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun6(ptr %src) {
+; CHECK-LABEL: 'fun6'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res = zext i64 %v to i128
+  %v = load i64, ptr %src, align 8
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+define i128 @fun7(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: 'fun7'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = zext i64 %v to i128
+  %v = add i64 %lhs, %rhs
+  %res = zext i64 %v to i128
+  ret i128 %res
+}
+
+; Truncating store is free.
+define void @fun8(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun8'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret void
+}
+
+; If there is a non-store user, an extraction is needed.
+define i64 @fun9(i128 %lhs, i128 %rhs, ptr %dst) {
+; CHECK-LABEL: 'fun9'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  store i64 %t, ptr %dst, align 8
+  ret i64 %t
+}
+
+; Truncation of load is free.
+define i64 @fun10(ptr %src) {
+; CHECK-LABEL: 'fun10'
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; If the load has another user, the truncation becomes an extract.
+define i64 @fun11(ptr %src, i128 %val2, ptr %dst) {
+; CHECK-LABEL: 'fun11'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = load i128, ptr %src, align 8
+  %t = trunc i128 %v to i64
+  %a = add i128 %v, %val2
+  store i128 %a, ptr %dst
+  ret i64 %t
+}
+
+; Truncation with a GPR use typically requires an extraction.
+define i64 @fun12(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: 'fun12'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %t = trunc i128 %v to i64
+  %v = add i128 %lhs, %rhs
+  %t = trunc i128 %v to i64
+  ret i64 %t
+}
+
+; Fp<->Int conversions require libcalls.
+define void @fun13() {
+; CHECK-LABEL: 'fun13'
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v0 = fptosi fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v1 = fptosi double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v2 = fptosi float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v3 = fptoui fp128 undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v4 = fptoui double undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v5 = fptoui float undef to i128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v6 = sitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v7 = sitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v8 = sitofp i128 undef to float
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v9 = uitofp i128 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v10 = uitofp i128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 30 for instruction:   %v11 = uitofp i128 undef to float
+  %v0 = fptosi fp128 undef to i128
+  %v1 = fptosi double undef to i128
+  %v2 = fptosi float undef to i128
+  %v3 = fptoui fp128 undef to i128
+  %v4 = fptoui double undef to i128
+  %v5 = fptoui float undef to i128
+  %v6 = sitofp i128 undef to fp128
+  %v7 = sitofp i128 undef to double
+  %v8 = sitofp i128 undef to float
+  %v9 = uitofp i128 undef to fp128
+  %v10 = uitofp i128 undef to double
+  %v11 = uitofp i128 undef to float
+  ret void
+}
+
+; All i128 immediates (big and small) are loaded from the constant pool.
+define void @fun14(ptr %dst) {
+; CHECK-LABEL: 'fun14'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 166153499473114484112, ptr %dst, align 8
+  store i128 166153499473114484112, ptr %dst, align 8
+  ret void
+}
+
+define void @fun15(ptr %dst) {
+; CHECK-LABEL: 'fun15'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 123, ptr %dst, align 8
+  store i128 123, ptr %dst, align 8
+  ret void
+}
+
+define void @fun16(ptr %dst, i128 %val1) {
+; CHECK-LABEL: 'fun16'
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = add i128 %val1, 123
+  %res = add i128 %val1, 123
+  store i128 %res, ptr %dst, align 8
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
index 71863b923ca38b..fc4d19c5cdf9e5 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -8,6 +8,7 @@ define void @add() {
   %res1 = add i16 undef, undef
   %res2 = add i32 undef, undef
   %res3 = add i64 undef, undef
+  %resQ = add i128 undef, undef
   %res4 = add <2 x i8> undef, undef
   %res5 = add <2 x i16> undef, undef
   %res6 = add <2 x i32> undef, undef
@@ -29,6 +30,7 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = add i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = add i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = add i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = add i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = add <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = add <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = add <2 x i32> undef, undef
@@ -54,6 +56,7 @@ define void @sub() {
   %res1 = sub i16 undef, undef
   %res2 = sub i32 undef, undef
   %res3 = sub i64 undef, undef
+  %resQ = sub i128 undef, undef
   %res4 = sub <2 x i8> undef, undef
   %res5 = sub <2 x i16> undef, undef
   %res6 = sub <2 x i32> undef, undef
@@ -75,6 +78,7 @@ define void @sub() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = sub i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = sub i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sub i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = sub i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = sub <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = sub <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = sub <2 x i32> undef, undef
@@ -100,6 +104,7 @@ define void @mul() {
   %res1 = mul i16 undef, undef
   %res2 = mul i32 undef, undef
   %res3 = mul i64 undef, undef
+  %resQ = mul i128 undef, undef
   %res4 = mul <2 x i8> undef, undef
   %res5 = mul <2 x i16> undef, undef
   %res6 = mul <2 x i32> undef, undef
@@ -121,6 +126,7 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = mul i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = mul i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = mul i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = mul i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = mul <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = mul <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = mul <2 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll b/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll
index d3e07fa9735b32..032b78099c5712 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll
@@ -3,6 +3,13 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z15 \
 ; RUN:  | FileCheck %s -check-prefixes=CHECK,Z15
 
+define void @bswap_i128(i128 %arg) {
+; CHECK: function 'bswap_i128'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp = tail call i128 @llvm.bswap.i128(i128 %arg)
+  %swp = tail call i128 @llvm.bswap.i128(i128 %arg)
+  ret void
+}
+
 define void @bswap_i64(i64 %arg, <2 x i64> %arg2) {
 ; CHECK: function 'bswap_i64'
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %swp1 = tail call i64
@@ -186,6 +193,8 @@ define void @bswap_v8i16_mem(ptr %src, <8 x i16> %arg, ptr %dst) {
   ret void
 }
 
+declare i128 @llvm.bswap.i128(i128)
+
 declare i64 @llvm.bswap.i64(i64)
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
diff --git a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
deleted file mode 100644
index cd6af575ea9ec3..00000000000000
--- a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; Check memory cost model action for a load of an unusually sized integer
-; follow by and a trunc to a register sized integer gives a cost of 1 rather
-; than the expanded cost if it is not.  This target does not currently perform
-; the expansion in the cost modelling.
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=systemz-unknown < %s | FileCheck %s --check-prefix=CHECK
-
-; Check that cost is 1 for unusual load to register sized load.
-define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
-; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
-;
-  %out = load i128, ptr %ptr
-  %trunc = trunc i128 %out to i32
-  ret i32 %trunc
-}
-
-define i128 @loadUnusualInteger(ptr %ptr) {
-; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
-;
-  %out = load i128, ptr %ptr
-  ret i128 %out
-}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/load_store.ll b/llvm/test/Analysis/CostModel/SystemZ/load_store.ll
index 1766dd3b2859e6..4d36c9ed421e08 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/load_store.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/load_store.ll
@@ -1,10 +1,13 @@
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=zEC12 | FileCheck %s --check-prefixes=CHECK,ZEC12
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z14 | FileCheck %s --check-prefixes=CHECK,Z14
 
 define void @store() {
   store i8 undef, ptr undef
   store i16 undef, ptr undef
   store i32 undef, ptr undef
   store i64 undef, ptr undef
+  store i128 undef, ptr undef
   store float undef, ptr undef
   store double undef, ptr undef
   store fp128 undef, ptr undef
@@ -37,9 +40,14 @@ define void @store() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i16 undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i32 undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i64 undef, ptr undef
+; ZEC12: Cost Model: Found an estimated cost of 2 for instruction:   store i128 undef, ptr undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   store i128 undef, ptr undef
+; Z14:   Cost Model: Found an estimated cost of 1 for instruction:   store i128 undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store float undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store double undef, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store fp128 undef, ptr undef
+; ZEC12: Cost Model: Found an estimated cost of 2 for instruction:   store fp128 undef, ptr undef
+; Z13:   Cost Model: Found an estimated cost of 2 for instruction:   store fp128 undef, ptr undef
+; Z14:   Cost Model: Found an estimated cost of 1 for instruction:   store fp128 undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i8> undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i16> undef, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i32> undef, ptr undef
@@ -73,6 +81,7 @@ define void @load() {
   load i16, ptr undef
   load i32, ptr undef
   load i64, ptr undef
+  load i128, ptr undef
   load float, ptr undef
   load double, ptr undef
   load fp128, ptr undef
@@ -105,33 +114,38 @@ define void @load() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = load i16, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = load i32, ptr undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = load i64, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = load float, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = load double, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %7 = load fp128, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = load <2 x i8>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = load <2 x i16>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = load <2 x i32>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = load <2 x i64>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = load <2 x float>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = load <2 x double>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = load <4 x i8>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = load <4 x i16>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = load <4 x i32>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %17 = load <4 x i64>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %18 = load <4 x float>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %19 = load <4 x double>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %20 = load <8 x i8>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %21 = load <8 x i16>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %22 = load <8 x i32>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %23 = load <8 x i64>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %24 = load <8 x float>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %25 = load <8 x double>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %26 = load <16 x i8>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %27 = load <16 x i16>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %28 = load <16 x i32>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %29 = load <16 x i64>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %30 = load <16 x float>, ptr undef
-; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %31 = load <16 x double>, ptr undef
+; ZEC12: Cost Model: Found an estimated cost of 2 for instruction:   %5 = load i128, ptr undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %5 = load i128, ptr undef
+; Z14:   Cost Model: Found an estimated cost of 1 for instruction:   %5 = load i128, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = load float, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = load double, ptr undef
+; ZEC12: Cost Model: Found an estimated cost of 2 for instruction:   %8 = load fp128, ptr undef
+; Z13:   Cost Model: Found an estimated cost of 2 for instruction:   %8 = load fp128, ptr undef
+; Z14:   Cost Model: Found an estimated cost of 1 for instruction:   %8 = load fp128, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = load <2 x i8>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = load <2 x i16>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = load <2 x i32>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = load <2 x i64>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = load <2 x float>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = load <2 x double>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = load <4 x i8>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = load <4 x i16>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %17 = load <4 x i32>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %18 = load <4 x i64>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %19 = load <4 x float>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %20 = load <4 x double>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %21 = load <8 x i8>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %22 = load <8 x i16>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %23 = load <8 x i32>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %24 = load <8 x i64>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %25 = load <8 x float>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %26 = load <8 x double>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %27 = load <16 x i8>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %28 = load <16 x i16>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %29 = load <16 x i32>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %30 = load <16 x i64>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %31 = load <16 x float>, ptr undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %32 = load <16 x double>, ptr undef
 
   ret void;
 }
diff --git a/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll b/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll
new file mode 100644
index 00000000000000..f4c4fceed717f3
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z13
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z14 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z14
+
+define void @fun(i128 %a)  {
+; CHECK-LABEL: 'fun'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c0 = xor i128 %l0, -1
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction: %res0 = or i128 %a, %c0
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction: %res0 = or i128 %a, %c0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c1 = xor i128 %l1, -1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %res1 = and i128 %a, %c1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c2 = and i128 %l2, %a
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction: %res2 = xor i128 %c2, -1
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction: %res2 = xor i128 %c2, -1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c3 = or i128 %l3, %a
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %res3 = xor i128 %c3, -1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c4 = xor i128 %l4, %a
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction: %res4 = xor i128 %c4, -1
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction: %res4 = xor i128 %c4, -1
+; Each group below is load -> logic op -> logic op -> store; the checks above cover only the two logic ops.
+  %l0 = load i128, ptr undef
+  %c0 = xor i128 %l0, -1                ; ~%l0 (xor with all-ones is bitwise NOT)
+  %res0 = or i128 %a, %c0               ; %a | ~%l0: expected cost 1 with -mcpu=z13, 0 with -mcpu=z14
+  store i128 %res0, ptr undef
+
+  %l1 = load i128, ptr undef
+  %c1 = xor i128 %l1, -1                ; ~%l1
+  %res1 = and i128 %a, %c1              ; %a & ~%l1: expected cost 0 on both CPUs
+  store i128 %res1, ptr undef
+
+  %l2 = load i128, ptr undef
+  %c2 = and i128 %l2, %a                ; %l2 & %a
+  %res2 = xor i128 %c2, -1              ; ~(%l2 & %a): expected cost 1 with -mcpu=z13, 0 with -mcpu=z14
+  store i128 %res2, ptr undef
+
+  %l3 = load i128, ptr undef
+  %c3 = or i128 %l3, %a                 ; %l3 | %a
+  %res3 = xor i128 %c3, -1              ; ~(%l3 | %a): expected cost 0 on both CPUs
+  store i128 %res3, ptr undef
+
+  %l4 = load i128, ptr undef
+  %c4 = xor i128 %l4, %a                ; %l4 ^ %a
+  %res4 = xor i128 %c4, -1              ; ~(%l4 ^ %a): expected cost 1 with -mcpu=z13, 0 with -mcpu=z14
+  store i128 %res4, ptr undef
+
+  ret void
+}
diff --git a/llvm/test/Analysis/CostModel/SystemZ/logical.ll b/llvm/test/Analysis/CostModel/SystemZ/logical.ll
index 29935d6895fc05..c87a3836ded6bd 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/logical.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/logical.ll
@@ -5,6 +5,7 @@ define void @and() {
   %res1 = and i16 undef, undef
   %res2 = and i32 undef, undef
   %res3 = and i64 undef, undef
+  %resQ = and i128 undef, undef
   %res4 = and <2 x i8> undef, undef
   %res5 = and <2 x i16> undef, undef
   %res6 = and <2 x i32> undef, undef
@@ -26,6 +27,7 @@ define void @and() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = and i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = and i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = and i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = and i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = and <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = and <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = and <2 x i32> undef, undef
@@ -51,6 +53,7 @@ define void @ashr() {
   %res1 = ashr i16 undef, undef
   %res2 = ashr i32 undef, undef
   %res3 = ashr i64 undef, undef
+  %resQ = ashr i128 undef, undef
   %res4 = ashr <2 x i8> undef, undef
   %res5 = ashr <2 x i16> undef, undef
   %res6 = ashr <2 x i32> undef, undef
@@ -72,6 +75,7 @@ define void @ashr() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = ashr i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = ashr i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = ashr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = ashr i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = ashr <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = ashr <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = ashr <2 x i32> undef, undef
@@ -97,6 +101,7 @@ define void @lshr() {
   %res1 = lshr i16 undef, undef
   %res2 = lshr i32 undef, undef
   %res3 = lshr i64 undef, undef
+  %resQ = lshr i128 undef, undef
   %res4 = lshr <2 x i8> undef, undef
   %res5 = lshr <2 x i16> undef, undef
   %res6 = lshr <2 x i32> undef, undef
@@ -118,6 +123,7 @@ define void @lshr() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = lshr i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = lshr i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = lshr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = lshr i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = lshr <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = lshr <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = lshr <2 x i32> undef, undef
@@ -143,6 +149,7 @@ define void @or() {
   %res1 = or i16 undef, undef
   %res2 = or i32 undef, undef
   %res3 = or i64 undef, undef
+  %resQ = or i128 undef, undef
   %res4 = or <2 x i8> undef, undef
   %res5 = or <2 x i16> undef, undef
   %res6 = or <2 x i32> undef, undef
@@ -164,6 +171,7 @@ define void @or() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = or i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = or i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = or i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = or i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = or <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = or <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = or <2 x i32> undef, undef
@@ -189,6 +197,7 @@ define void @shl() {
   %res1 = shl i16 undef, undef
   %res2 = shl i32 undef, undef
   %res3 = shl i64 undef, undef
+  %resQ = shl i128 undef, undef
   %res4 = shl <2 x i8> undef, undef
   %res5 = shl <2 x i16> undef, undef
   %res6 = shl <2 x i32> undef, undef
@@ -210,6 +219,7 @@ define void @shl() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = shl i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = shl i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = shl i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = shl i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = shl <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = shl <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = shl <2 x i32> undef, undef
@@ -235,6 +245,7 @@ define void @xor() {
   %res1 = xor i16 undef, undef
   %res2 = xor i32 undef, undef
   %res3 = xor i64 undef, undef
+  %resQ = xor i128 undef, undef
   %res4 = xor <2 x i8> undef, undef
   %res5 = xor <2 x i16> undef, undef
   %res6 = xor <2 x i32> undef, undef
@@ -256,6 +267,7 @@ define void @xor() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = xor i16 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = xor i32 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = xor i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %resQ = xor i128 undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = xor <2 x i8> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = xor <2 x i16> undef, undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = xor <2 x i32> undef, undef

>From c0c60f136cbd4e0f52ab00a7a18341b6cd5d4d68 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Thu, 18 Jan 2024 10:34:01 -0600
Subject: [PATCH 2/2] Remove the i128 immediate costs for now.

---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 36 +++-------------
 .../CostModel/SystemZ/i128-cmp-ext-conv.ll    | 42 -------------------
 2 files changed, 6 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index d69ff9e96c3e47..9370fb51a96c56 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -467,8 +467,6 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     }
   }
 
-  unsigned ImmLoadCost = 0;
-
   if (!Ty->isVectorTy()) {
     // These FP operations are supported with a dedicated instruction for
     // float, double and fp128 (base implementation assumes float generally
@@ -481,13 +479,6 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     if (Opcode == Instruction::FRem)
       return LIBCALL_COST;
 
-    // Most i128 immediates must be loaded from the constant pool.
-    if (Ty->isIntegerTy(128))
-      for (const Value *A : Args)
-        if (auto *C = dyn_cast<ConstantInt>(A))
-          if (Opcode != Instruction::Xor || !C->isAllOnesValue())
-            ImmLoadCost++;
-
     // Give discount for some combined logical operations if supported.
     if (Args.size() == 2) {
       if (Opcode == Instruction::Xor) {
@@ -500,7 +491,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
               if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                   (isInt128InVR(Ty) &&
                    (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
-                return 0 + ImmLoadCost;
+                return 0;
         }
       }
       else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
@@ -510,14 +501,14 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
                 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                  (isInt128InVR(Ty) &&
                   (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
-              return 0 + ImmLoadCost;
+              return 0;
         }
       }
     }
 
     // Or requires one instruction, although it has custom handling for i64.
     if (Opcode == Instruction::Or)
-      return 1 + ImmLoadCost;
+      return 1;
 
     if (Opcode == Instruction::Xor && ScalarBits == 1) {
       if (ST->hasLoadStoreOnCond2())
@@ -605,7 +596,7 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
 
   // Fallback to the default implementation.
   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
-                                       Args, CxtI) + ImmLoadCost;
+                                       Args, CxtI);
 }
 
 InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
@@ -990,21 +981,11 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       unsigned Cost = 1;
       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
         Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
-      if (isInt128InVR(ValTy) && I != nullptr &&
-          isa<ConstantInt>(I->getOperand(1)))
-        Cost++;
       return Cost;
     }
     case Instruction::Select:
-      if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP - costs a conditional jump.
-      if (I != nullptr && isInt128InVR(ValTy)) {
-        unsigned ImmLoadCost = 0;
-        if (isa<ConstantInt>(I->getOperand(1)) ||
-            isa<ConstantInt>(I->getOperand(2)))
-          ImmLoadCost++;
-        return 4 + ImmLoadCost;
-      }
+      if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
+        return 4; // No LOC for FP / i128 - costs a conditional jump.
       return 1; // Load On Condition / Select Register.
     }
   }
@@ -1217,11 +1198,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
-  // Storing an i128 constant requires load from Constant Pool.
-  if (isInt128InVR(Src) && Opcode == Instruction::Store && I != nullptr &&
-      isa<ConstantInt>(I->getOperand(0)))
-    return 2;
-
   // FP128 is a legal type but kept in a register pair on older CPUs.
   if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
     return 2;
diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
index d3e60c7df51e5a..66da6de3bc7681 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
@@ -31,25 +31,6 @@ define i128 @fun3(i128 %val1, i128 %val2,
   ret i128 %sel
 }
 
-
-define i128 @fun3_b(i128 %val1) {
-; CHECK-LABEL: 'fun3_b'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 %val1, i128 456
-  %cmp = icmp eq i128 %val1, 123
-  %sel = select i1 %cmp, i128 %val1, i128 456
-  ret i128 %sel
-}
-
-define i128 @fun3_c(i128 %val1) {
-; CHECK-LABEL: 'fun3_c'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %cmp = icmp eq i128 %val1, 123
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %sel = select i1 %cmp, i128 567, i128 456
-  %cmp = icmp eq i128 %val1, 123
-  %sel = select i1 %cmp, i128 567, i128 456
-  ret i128 %sel
-}
-
 define i128 @fun4(ptr %src) {
 ; CHECK-LABEL: 'fun4'
 ; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = sext i64 %v to i128
@@ -160,26 +141,3 @@ define void @fun13() {
   %v11 = uitofp i128 undef to float
   ret void
 }
-
-; All i128 immediates (big and small) are loaded from the constant pool.
-define void @fun14(ptr %dst) {
-; CHECK-LABEL: 'fun14'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 166153499473114484112, ptr %dst, align 8
-  store i128 166153499473114484112, ptr %dst, align 8
-  ret void
-}
-
-define void @fun15(ptr %dst) {
-; CHECK-LABEL: 'fun15'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store i128 123, ptr %dst, align 8
-  store i128 123, ptr %dst, align 8
-  ret void
-}
-
-define void @fun16(ptr %dst, i128 %val1) {
-; CHECK-LABEL: 'fun16'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res = add i128 %val1, 123
-  %res = add i128 %val1, 123
-  store i128 %res, ptr %dst, align 8
-  ret void
-}