[llvm] a6d3bec - [TTI][ARM][MVE] Refine gather/scatter cost model

Wed Mar 11 03:23:56 PDT 2020

Author: Anna Welker
Date: 2020-03-11T10:23:41Z
New Revision: a6d3bec83fca0568e1fb02b9297b43435b9579d6

URL: https://github.com/llvm/llvm-project/commit/a6d3bec83fca0568e1fb02b9297b43435b9579d6
DIFF: https://github.com/llvm/llvm-project/commit/a6d3bec83fca0568e1fb02b9297b43435b9579d6.diff

LOG: [TTI][ARM][MVE] Refine gather/scatter cost model

Refines the gather/scatter cost model. To make this possible, the TTI
functions getGatherScatterOpCost and getIntrinsicInstrCost now accept an
additional, optional parameter (the original context instruction), which
is needed for the gather/scatter cost evaluation.
This required trivial changes in some non-ARM backends to
adopt the new parameter.
Extending gathers and truncating scatters are now priced cheaper.

Differential Revision: https://reviews.llvm.org/D75525

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/include/llvm/CodeGen/BasicTTIImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
    llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
    llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
    llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
    llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
    llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 2968a5f37a46..1387b90307f7 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -966,8 +966,11 @@ class TargetTransformInfo {
   /// \p VariableMask - true when the memory access is predicated with a mask
   ///                   that is not a compile-time constant
   /// \p Alignment - alignment of single element
+  /// \p I - the optional original context instruction, if one exists, e.g. the
+  ///        load/store to transform or the call to the gather/scatter intrinsic
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                             bool VariableMask, unsigned Alignment) const;
+                             bool VariableMask, unsigned Alignment,
+                             const Instruction *I = nullptr) const;
 
   /// \return The cost of the interleaved memory operation.
   /// \p Opcode is the memory operation code
@@ -1006,16 +1009,22 @@ class TargetTransformInfo {
   /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
   /// Three cases are handled: 1. scalar instruction 2. vector instruction
   /// 3. scalar instruction which is to be vectorized with VF.
+  /// I is the optional original context instruction holding the call to the
+  /// intrinsic
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                             ArrayRef<Value *> Args, FastMathFlags FMF,
-                            unsigned VF = 1) const;
+                            unsigned VF = 1,
+                            const Instruction *I = nullptr) const;
 
   /// \returns The cost of Intrinsic instructions. Types analysis only.
   /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
   /// arguments and the return value will be computed based on types.
-  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF,
-                            unsigned ScalarizationCostPassed = UINT_MAX) const;
+  /// I is the optional original context instruction holding the call to the
+  /// intrinsic
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
+                            FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX,
+                            const Instruction *I = nullptr) const;
 
   /// \returns The cost of Call instructions.
   int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
@@ -1340,9 +1349,9 @@ class TargetTransformInfo::Concept {
   virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) = 0;
-  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
-                                     Value *Ptr, bool VariableMask,
-                                     unsigned Alignment) = 0;
+  virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                                     bool VariableMask, unsigned Alignment,
+                                     const Instruction *I = nullptr) = 0;
   virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
@@ -1355,10 +1364,12 @@ class TargetTransformInfo::Concept {
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                      bool IsPairwiseForm, bool IsUnsigned) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                      ArrayRef<Type *> Tys, FastMathFlags FMF,
-                      unsigned ScalarizationCostPassed) = 0;
+                                    ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                    unsigned ScalarizationCostPassed,
+                                    const Instruction *I) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-         ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
+                                    ArrayRef<Value *> Args, FastMathFlags FMF,
+                                    unsigned VF, const Instruction *I) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -1759,11 +1770,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                             unsigned AddressSpace) override {
     return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
   }
-  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
-                             Value *Ptr, bool VariableMask,
-                             unsigned Alignment) override {
+  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
+                             bool VariableMask, unsigned Alignment,
+                             const Instruction *I = nullptr) override {
     return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                       Alignment);
+                                       Alignment, I);
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
@@ -1781,15 +1792,18 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                              bool IsPairwiseForm, bool IsUnsigned) override {
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
    }
-  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
-               FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
-                                      ScalarizationCostPassed);
-  }
-  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-       ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-  }
+   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                             ArrayRef<Type *> Tys, FastMathFlags FMF,
+                             unsigned ScalarizationCostPassed,
+                             const Instruction *I) override {
+     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                       ScalarizationCostPassed, I);
+   }
+   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                             ArrayRef<Value *> Args, FastMathFlags FMF,
+                             unsigned VF, const Instruction *I) override {
+     return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
+   }
   int getCallInstrCost(Function *F, Type *RetTy,
                        ArrayRef<Type *> Tys) override {
     return Impl.getCallInstrCost(F, RetTy, Tys);

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 5c51d30384b7..60065731784a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -481,8 +481,8 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                                  bool VariableMask,
-                                  unsigned Alignment) {
+                                  bool VariableMask, unsigned Alignment,
+                                  const Instruction *I = nullptr) {
     return 1;
   }
 
@@ -497,11 +497,13 @@ class TargetTransformInfoImplBase {
 
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys, FastMathFlags FMF,
-                                 unsigned ScalarizationCostPassed) {
+                                 unsigned ScalarizationCostPassed,
+                                 const Instruction *I) {
     return 1;
   }
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF, const Instruction *I) {
     return 1;
   }
 

diff  --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d3f8896eca16..f3dd322940d3 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1072,7 +1072,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   /// Get intrinsic cost based on arguments.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                  ArrayRef<Value *> Args, FastMathFlags FMF,
-                                 unsigned VF = 1) {
+                                 unsigned VF = 1,
+                                 const Instruction *I = nullptr) {
     unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
     assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
     auto *ConcreteTTI = static_cast<T *>(this);
@@ -1109,16 +1110,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
-      return ConcreteTTI->getGatherScatterOpCost(
-          Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(Instruction::Store,
+                                                 Args[0]->getType(), Args[1],
+                                                 VarMask, Alignment, I);
     }
     case Intrinsic::masked_gather: {
       assert(VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
-      return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
-                                                 Args[0], VarMask, Alignment);
+      return ConcreteTTI->getGatherScatterOpCost(
+          Instruction::Load, RetTy, Args[0], VarMask, Alignment, I);
     }
     case Intrinsic::experimental_vector_reduce_add:
     case Intrinsic::experimental_vector_reduce_mul:
@@ -1180,7 +1182,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   /// based on types.
   unsigned getIntrinsicInstrCost(
       Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
-      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
+      unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max(),
+      const Instruction *I = nullptr) {
     auto *ConcreteTTI = static_cast<T *>(this);
 
     SmallVector<unsigned, 2> ISDs;

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 865f8975825a..a1b30fc8f12b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -674,9 +674,10 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
 
 int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                                 Value *Ptr, bool VariableMask,
-                                                unsigned Alignment) const {
+                                                unsigned Alignment,
+                                                const Instruction *I) const {
   int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                             Alignment);
+                                             Alignment, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -694,17 +695,21 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Type *> Tys, FastMathFlags FMF,
-                                    unsigned ScalarizationCostPassed) const {
+                                               ArrayRef<Type *> Tys,
+                                               FastMathFlags FMF,
+                                               unsigned ScalarizationCostPassed,
+                                               const Instruction *I) const {
   int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
-                                            ScalarizationCostPassed);
+                                            ScalarizationCostPassed, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-           ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+                                               ArrayRef<Value *> Args,
+                                               FastMathFlags FMF, unsigned VF,
+                                               const Instruction *I) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -1339,8 +1344,8 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
         FMF = FPMO->getFastMathFlags();
 
-      return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
-                                        Args, FMF);
+      return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
+                                   FMF, 1, II);
     }
     return -1;
   default:

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b7635f078cd7..caa3a4aa31f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -478,14 +478,14 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
 template <typename T>
 int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                      ArrayRef<T *> Args,
-                                      FastMathFlags FMF, unsigned VF) {
+                                      ArrayRef<T *> Args, FastMathFlags FMF,
+                                      unsigned VF, const Instruction *I) {
   if (ID != Intrinsic::fma)
-    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
 
   EVT OrigTy = TLI->getValueType(DL, RetTy);
   if (!OrigTy.isSimple()) {
-    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+    return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
   }
 
   // Legalize the type.
@@ -507,16 +507,17 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
 }
 
 int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                      ArrayRef<Value*> Args, FastMathFlags FMF,
-                                      unsigned VF) {
-  return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
+                                      ArrayRef<Value *> Args, FastMathFlags FMF,
+                                      unsigned VF, const Instruction *I) {
+  return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF, I);
 }
 
 int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
-                                      unsigned ScalarizationCostPassed) {
+                                      unsigned ScalarizationCostPassed,
+                                      const Instruction *I) {
   return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
-                                     ScalarizationCostPassed);
+                                     ScalarizationCostPassed, I);
 }
 
 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
@@ -889,7 +890,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
         FMF = FPMO->getFastMathFlags();
       return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
-                                   FMF);
+                                   FMF, 1, II);
     } else {
       return BaseT::getUserCost(U, Operands);
     }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dee2e7d314d6..d2a25bf2fb90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -219,15 +219,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                  Type *Ty,
                                  bool IsPairwise);
   template <typename T>
-  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<T *> Args, FastMathFlags FMF,
-                            unsigned VF);
+  int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<T *> Args,
+                            FastMathFlags FMF, unsigned VF,
+                            const Instruction *I = nullptr);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF,
-                            unsigned ScalarizationCostPassed = UINT_MAX);
+                            unsigned ScalarizationCostPassed = UINT_MAX,
+                            const Instruction *I = nullptr);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Value *> Args, FastMathFlags FMF,
-                            unsigned VF = 1);
+                            unsigned VF = 1, const Instruction *I = nullptr);
   int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                              bool IsPairwiseForm,
                              bool IsUnsigned);

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ffac332469ac..3a88575d168a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -863,16 +863,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
 
 unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                             Value *Ptr, bool VariableMask,
-                                            unsigned Alignment) {
+                                            unsigned Alignment,
+                                            const Instruction *I) {
+  using namespace PatternMatch;
   if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
     return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                         Alignment);
+                                         Alignment, I);
 
   assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
   VectorType *VTy = cast<VectorType>(DataTy);
 
   // TODO: Splitting, once we do that.
-  // TODO: trunc/sext/zext the result/input
 
   unsigned NumElems = VTy->getNumElements();
   unsigned EltSize = VTy->getScalarSizeInBits();
@@ -889,19 +890,54 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
   unsigned ScalarCost =
       NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {});
 
-  // TODO: Cost extended gathers or trunc stores correctly.
-  if (EltSize * NumElems != 128 || NumElems < 4)
-    return ScalarCost;
   if (Alignment < EltSize / 8)
     return ScalarCost;
 
+  unsigned ExtSize = EltSize;
+  // Check whether there's a single user that asks for an extended type
+  if (I != nullptr) {
+    // Depending on the caller of this function, a gather instruction will
+    // either have opcode Instruction::Load or be a call to the masked_gather
+    // intrinsic
+    if ((I->getOpcode() == Instruction::Load ||
+         match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
+        I->hasOneUse()) {
+      const User *Us = *I->users().begin();
+      if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
+        // only allow valid type combinations
+        unsigned TypeSize =
+            cast<Instruction>(Us)->getType()->getScalarSizeInBits();
+        if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
+             (TypeSize == 16 && EltSize == 8)) &&
+            TypeSize * NumElems == 128) {
+          ExtSize = TypeSize;
+        }
+      }
+    }
+    // Check whether the input data needs to be truncated
+    TruncInst *T;
+    if ((I->getOpcode() == Instruction::Store ||
+         match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
+        (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
+      // Only allow valid type combinations
+      unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
+      if (((EltSize == 16 && TypeSize == 32) ||
+           (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
+          TypeSize * NumElems == 128)
+        ExtSize = TypeSize;
+    }
+  }
+
+  if (ExtSize * NumElems != 128 || NumElems < 4)
+    return ScalarCost;
+
   // Any (aligned) i32 gather will not need to be scalarised.
-  if (EltSize == 32)
+  if (ExtSize == 32)
     return VectorCost;
   // For smaller types, we need to ensure that the gep's inputs are correctly
-  // extended from a small enough value. Other size (including i64) are
+  // extended from a small enough value. Other sizes (including i64) are
   // scalarized for now.
-  if (EltSize != 8 && EltSize != 16)
+  if (ExtSize != 8 && ExtSize != 16)
     return ScalarCost;
 
   if (auto BC = dyn_cast<BitCastInst>(Ptr))
@@ -911,12 +947,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
       return ScalarCost;
     unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
     // Scale needs to be correct (which is only relevant for i16s).
-    if (Scale != 1 && Scale * 8 != EltSize)
+    if (Scale != 1 && Scale * 8 != ExtSize)
       return ScalarCost;
     // And we need to zext (not sext) the indexes from a small enough type.
-    if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1)))
-      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize)
+    if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
+      if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
         return VectorCost;
+    }
     return ScalarCost;
   }
   return ScalarCost;

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index cab1514be4ad..f21b348b6476 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -222,7 +222,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
                                  bool UseMaskForGaps = false);
 
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                                  bool VariableMask, unsigned Alignment);
+                                  bool VariableMask, unsigned Alignment,
+                                  const Instruction *I = nullptr);
 
   bool isLoweredToCall(const Function *F);
   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,

diff  --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 7ec8070f21bc..7f6058a78b61 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -131,19 +131,23 @@ unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
 }
 
 unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-      ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
-  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+                                               ArrayRef<Value *> Args,
+                                               FastMathFlags FMF, unsigned VF,
+                                               const Instruction *I) {
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
 }
 
 unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-      ArrayRef<Type*> Tys, FastMathFlags FMF,
-      unsigned ScalarizationCostPassed) {
+                                               ArrayRef<Type *> Tys,
+                                               FastMathFlags FMF,
+                                               unsigned ScalarizationCostPassed,
+                                               const Instruction *I) {
   if (ID == Intrinsic::bswap) {
     std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
     return LT.first + 2;
   }
   return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
-                                      ScalarizationCostPassed);
+                                      ScalarizationCostPassed, I);
 }
 
 unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
@@ -209,9 +213,11 @@ unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
 }
 
 unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
-      Value *Ptr, bool VariableMask, unsigned Alignment) {
+                                                Value *Ptr, bool VariableMask,
+                                                unsigned Alignment,
+                                                const Instruction *I) {
   return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
-                                       Alignment);
+                                       Alignment, I);
 }
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,

diff  --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index ace0d797bbdb..409e1dfdb6b3 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -106,10 +106,12 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
             unsigned VF);
   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF, const Instruction *I);
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Type*> Tys, FastMathFlags FMF,
-            unsigned ScalarizationCostPassed = UINT_MAX);
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX,
+                                 const Instruction *I = nullptr);
   unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
             const SCEV *S);
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
@@ -120,7 +122,8 @@ class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
             Type *SubTp);
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-            bool VariableMask, unsigned Alignment);
+                                  bool VariableMask, unsigned Alignment,
+                                  const Instruction *I);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
             unsigned AddressSpace, bool UseMaskForCond = false,

diff  --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 202272829018..1bb78edcfb75 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -936,17 +936,21 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 }
 
 unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-      ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
-  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+                                           ArrayRef<Value *> Args,
+                                           FastMathFlags FMF, unsigned VF,
+                                           const Instruction *I) {
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
 }
 
 unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-      ArrayRef<Type*> Tys, FastMathFlags FMF,
-      unsigned ScalarizationCostPassed) {
+                                           ArrayRef<Type *> Tys,
+                                           FastMathFlags FMF,
+                                           unsigned ScalarizationCostPassed,
+                                           const Instruction *I) {
   if (ID == Intrinsic::bswap && ST->hasP9Vector())
     return TLI->getTypeLegalizationCost(DL, RetTy).first;
   return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
-                                      ScalarizationCostPassed);
+                                      ScalarizationCostPassed, I);
 }
 
 bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,

diff  --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index b89e585b0806..5aea5242f242 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -111,10 +111,12 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
                                  bool UseMaskForCond = false,
                                  bool UseMaskForGaps = false);
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF, const Instruction *I = nullptr);
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-            ArrayRef<Type*> Tys, FastMathFlags FMF,
-            unsigned ScalarizationCostPassed = UINT_MAX);
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX,
+                                 const Instruction *I = nullptr);
 
   /// @}
 };

diff  --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index df5286bef817..d088682cf7d3 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1124,20 +1124,22 @@ static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
 
 int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                           ArrayRef<Value *> Args,
-                                          FastMathFlags FMF, unsigned VF) {
+                                          FastMathFlags FMF, unsigned VF,
+                                          const Instruction *I) {
   int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
   if (Cost != -1)
     return Cost;
-  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
 }
 
 int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                           ArrayRef<Type *> Tys,
                                           FastMathFlags FMF,
-                                          unsigned ScalarizationCostPassed) {
+                                          unsigned ScalarizationCostPassed,
+                                          const Instruction *I) {
   int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
   if (Cost != -1)
     return Cost;
-  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
-                                      FMF, ScalarizationCostPassed);
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed, I);
 }

diff  --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bc4d066881c1..590505769c70 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -101,10 +101,11 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
 
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                             ArrayRef<Value *> Args, FastMathFlags FMF,
-                            unsigned VF = 1);
-  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF,
-                            unsigned ScalarizationCostPassed = UINT_MAX);
+                            unsigned VF = 1, const Instruction *I = nullptr);
+  int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
+                            FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX,
+                            const Instruction *I = nullptr);
   /// @}
 };
 

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 207e1d903eb8..4ac610aef14b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1887,7 +1887,8 @@ unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                       ArrayRef<Type *> Tys, FastMathFlags FMF,
-                                      unsigned ScalarizationCostPassed) {
+                                      unsigned ScalarizationCostPassed,
+                                      const Instruction *I) {
   // Costs should match the codegen from:
   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -2309,12 +2310,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
       return LT.first * Entry->Cost;
   }
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed, I);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                       ArrayRef<Value *> Args, FastMathFlags FMF,
-                                      unsigned VF) {
+                                      unsigned VF, const Instruction *I) {
   static const CostTblEntry AVX512CostTbl[] = {
     { ISD::ROTL,       MVT::v8i64,   1 },
     { ISD::ROTL,       MVT::v4i64,   1 },
@@ -2404,7 +2406,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
       return LT.first * Entry->Cost;
   }
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF, I);
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -3354,7 +3356,8 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
 /// Calculate the cost of Gather / Scatter operation
 int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
                                        Value *Ptr, bool VariableMask,
-                                       unsigned Alignment) {
+                                       unsigned Alignment,
+                                       const Instruction *I = nullptr) {
   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
   unsigned VF = SrcVTy->getVectorNumElements();
   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b9c2dbd78058..4b480649653f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -138,7 +138,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                             bool VariableMask, unsigned Alignment);
+                             bool VariableMask, unsigned Alignment,
+                             const Instruction *I);
   int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
                                 const SCEV *Ptr);
 
@@ -146,10 +147,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
 
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Type *> Tys, FastMathFlags FMF,
-                            unsigned ScalarizationCostPassed = UINT_MAX);
+                            unsigned ScalarizationCostPassed = UINT_MAX,
+                            const Instruction *I = nullptr);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                             ArrayRef<Value *> Args, FastMathFlags FMF,
-                            unsigned VF = 1);
+                            unsigned VF = 1, const Instruction *I = nullptr);
 
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm);

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 49e38c37a46e..c6bf118a8c37 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3301,7 +3301,7 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
     FMF = FPMO->getFastMathFlags();
 
   SmallVector<Value *, 4> Operands(CI->arg_operands());
-  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -5889,7 +5889,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                     Legal->isMaskRequired(I),
-                                    Alignment ? Alignment->value() : 0);
+                                    Alignment ? Alignment->value() : 0, I);
 }
 
 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,

diff  --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
index 0f28d435b5ca..53480f5a2789 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll
@@ -134,21 +134,26 @@ define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x i32*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
@@ -193,21 +198,26 @@ define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32,
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, float* %base, <4 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, float* %base, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, float* %base, <4 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
@@ -252,14 +262,28 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res1, <4 x i16*> %gep1, i32 2, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <4 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res2, <4 x i16*> %gep2, i32 2, <4 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %res5zext = zext <4 x i16> %res5 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %res6sext = sext <4 x i16> %res6 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
+;
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32
@@ -275,43 +299,109 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1>
   %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext
   %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
   call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask)
+  
+  ; result zext
+  %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16
+  %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+  %res5zext = zext <4 x i16> %res5 to <4 x i32>
+  %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
+  
+  ; result sext
+  %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
+  %res6sext = sext <4 x i16> %res6 to <4 x i32>
+  %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
+  
   ret void
 }
 
-define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask)  {
+define void @gep_v4i8(i8* %base, <4 x i8> %ind8, <4 x i1> %mask)  {
+; CHECK-LABEL: 'gep_v4i8'
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res5zext = zext <4 x i8> %res5 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   %res6sext = sext <4 x i8> %res6 to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction:   call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
+;
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  
+  ; result zext
+  %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8
+  %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+  %res5zext = zext <4 x i8> %res5 to <4 x i32>
+  %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
+  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
+  
+  ; result sext
+  %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
+  %res6sext = sext <4 x i8> %res6 to <4 x i32>
+  %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
+  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
+  
+  ret void
+}
+
+define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask)  {
 ; CHECK-LABEL: 'gep_v8i16'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction:   %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction:   %ressext = sext <8 x i16> %res to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %restrunc = trunc <8 x i32> %ressext to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  ; no offset ext
   %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32
   %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
 
+  ; offset zext
   %indzext = zext <8 x i16> %ind16 to <8 x i32>
   %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext
   %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask)
 
+  ; offset sext
   %indsext = sext <8 x i16> %ind16 to <8 x i32>
   %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext
   %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
@@ -332,6 +422,19 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <
   %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*>
   %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
   call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
+
+  ; trunc scatter
+  %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
+  %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4
+  %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask)
+  
+  ; ext result to <8 x i32>
+  %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
+  %ressext = sext <8 x i16> %res to <8 x i32>
+  %restrunc = trunc <8 x i32> %ressext to <8 x i16>
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask)
+  
   ret void
 }
 
@@ -340,35 +443,44 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32,
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, half* %base, <8 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+
+  ; no offset ext
   %gep1 = getelementptr half, half* %base, <8 x i32> %ind32
   %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
   call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
 
+  ; offset zext
   %indzext = zext <8 x i16> %ind16 to <8 x i32>
   %gep2 = getelementptr half, half* %base, <8 x i32> %indzext
   %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
   call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask)
 
+  ; offset sext
   %indsext = sext <8 x i16> %ind16 to <8 x i32>
   %gep3 = getelementptr half, half* %base, <8 x i32> %indsext
   %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
@@ -389,6 +501,42 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32,
   %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*>
   %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
   call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
+
+  ret void
+}
+
+define void @gep_v8i8(i8* %base, <8 x i8> %ind8, <8 x i1> %mask)  {
+; CHECK-LABEL: 'gep_v8i8'
+; Pins the costs reported for v8i8 gathers with an extended result and scatters of a truncated value.
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction:   %indzext = zext <8 x i8> %ind8 to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %res5zext = zext <8 x i8> %res5 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction:   %res6sext = sext <8 x i8> %res6 to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction:   call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+
+  ; result zext: gather <8 x i8> via zero-extended i8 offsets, zero-extend it to <8 x i16>, then truncate back and scatter
+  %indzext = zext <8 x i8> %ind8 to <8 x i32>
+  %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext
+  %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+  %res5zext = zext <8 x i8> %res5 to <8 x i16>
+  %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
+  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
+  
+  ; result sext: same gather/scatter pattern through %gep5 again, but sign-extending the gathered value
+  %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
+  %res6sext = sext <8 x i8> %res6 to <8 x i16>
+  %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
+  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
+  
   ret void
 }
 
@@ -397,29 +545,40 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask)
+
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
+
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction:   %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction:   %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction:   %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction:   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  ; no offset ext
   %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32
   %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
 
+  ; offset zext
   %indzext = zext <16 x i8> %ind8 to <16 x i32>
   %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext
   %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask)
 
+  ; offset sext
   %indsext = sext <16 x i8> %ind8 to <16 x i32>
   %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext
   %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
@@ -430,6 +589,13 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind
   %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*>
   %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
   call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
+
+  ; trunc scatter
+  %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
+  %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext
+  %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
+  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask)
+  
   ret void
 }