[llvm] 0da1e7e - [SVE] Remove calls to VectorType::getNumElements from X86

Mon Jun 29 11:10:51 PDT 2020

Author: Christopher Tetreault
Date: 2020-06-29T11:10:35-07:00
New Revision: 0da1e7ebf9a82e49858e77a5bbe1a14f08ef00d7

URL: https://github.com/llvm/llvm-project/commit/0da1e7ebf9a82e49858e77a5bbe1a14f08ef00d7
DIFF: https://github.com/llvm/llvm-project/commit/0da1e7ebf9a82e49858e77a5bbe1a14f08ef00d7.diff

LOG: [SVE] Remove calls to VectorType::getNumElements from X86

Reviewers: efriedma, RKSimon, craig.topper, fpetrogalli, c-rhodes

Reviewed By: RKSimon

Subscribers: tschuett, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D82508

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86InterleavedAccess.cpp
    llvm/lib/Target/X86/X86PartialReduction.cpp
    llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/lib/Target/X86/X86TargetTransformInfo.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 00ac238f284b..a19e12766e10 100644

--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -69,7 +69,7 @@ class X86InterleavedAccessGroup {
 
   /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
   /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
-  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                  SmallVectorImpl<Instruction *> &DecomposedVectors);
 
   /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -165,7 +165,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
 }
 
 void X86InterleavedAccessGroup::decompose(
-    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
     SmallVectorImpl<Instruction *> &DecomposedVectors) {
   assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
          "Expected Load or Shuffle");
@@ -727,13 +727,13 @@ void X86InterleavedAccessGroup::transpose_4x4(
 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
   SmallVector<Instruction *, 4> DecomposedVectors;
   SmallVector<Value *, 4> TransposedVectors;
-  VectorType *ShuffleTy = Shuffles[0]->getType();
+  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
 
   if (isa<LoadInst>(Inst)) {
     // Try to generate target-sized register(/instruction).
     decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
 
-    auto *ShuffleEltTy = cast<VectorType>(Inst->getType());
+    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
     unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
     // Perform matrix-transposition in order to compute interleaved
     // results by generating some sort of (optimized) target-specific
@@ -832,7 +832,8 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
-  assert(SVI->getType()->getNumElements() % Factor == 0 &&
+  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
+             0 &&
          "Invalid interleaved store");
 
   // Holds the indices of SVI that correspond to the starting index of each

diff  --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index 65caeab1d1cf..8784a3df1773 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -68,7 +68,7 @@ bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
     return false;
 
   // Need at least 8 elements.
-  if (cast<VectorType>(Op->getType())->getNumElements() < 8)
+  if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
     return false;
 
   // Element type should be i32.
@@ -136,7 +136,7 @@ bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
 
   IRBuilder<> Builder(Mul);
 
-  auto *MulTy = cast<VectorType>(Op->getType());
+  auto *MulTy = cast<FixedVectorType>(Op->getType());
   unsigned NumElts = MulTy->getNumElements();
 
   // Extract even elements and odd elements and add them together. This will
@@ -211,7 +211,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
 
   IRBuilder<> Builder(SI);
 
-  auto *OpTy = cast<VectorType>(Op->getType());
+  auto *OpTy = cast<FixedVectorType>(Op->getType());
   unsigned NumElts = OpTy->getNumElements();
 
   unsigned IntrinsicNumElts;
@@ -265,7 +265,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
   unsigned Stages = Log2_32(NumSplits);
   for (unsigned s = Stages; s > 0; --s) {
     unsigned NumConcatElts =
-        cast<VectorType>(Ops[0]->getType())->getNumElements() * 2;
+        cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
     for (unsigned i = 0; i != 1U << (s - 1); ++i) {
       SmallVector<int, 64> ConcatMask(NumConcatElts);
       std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
@@ -275,13 +275,14 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
 
   // At this point the final value should be in Ops[0]. Now we need to adjust
   // it to the final original type.
-  NumElts = cast<VectorType>(OpTy)->getNumElements();
+  NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
   if (NumElts == 2) {
     // Extract down to 2 elements.
     Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
   } else if (NumElts >= 8) {
     SmallVector<int, 32> ConcatMask(NumElts);
-    unsigned SubElts = cast<VectorType>(Ops[0]->getType())->getNumElements();
+    unsigned SubElts =
+        cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
     for (unsigned i = 0; i != SubElts; ++i)
       ConcatMask[i] = i;
     for (unsigned i = SubElts; i != NumElts; ++i)
@@ -309,7 +310,7 @@ static Value *matchAddReduction(const ExtractElementInst &EE) {
   if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
     return nullptr;
 
-  unsigned NumElems = cast<VectorType>(BO->getType())->getNumElements();
+  unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
   // Ensure the reduction size is a power of 2.
   if (!isPowerOf2_32(NumElems))
     return nullptr;

diff  --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 3408dab03656..de528299654c 100644
--- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -36,7 +36,7 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   //
   //   <4 x i32> <i32 -2147483648, i32 -2147483648,
   //              i32 -2147483648, i32 -2147483648>
-  auto *CstTy = dyn_cast<VectorType>(C->getType());
+  auto *CstTy = dyn_cast<FixedVectorType>(C->getType());
   if (!CstTy)
     return false;
 

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b3f7fd2a9d73..cc18e55656ef 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -973,7 +973,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
       // FIXME: Remove some of the alignment restrictions.
       // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
       // vectors.
-      int OrigSubElts = cast<VectorType>(SubTp)->getNumElements();
+      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
       if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
           (NumSubElts % OrigSubElts) == 0 &&
           LT.second.getVectorElementType() ==
@@ -1047,7 +1047,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
     if (LegalVT.isVector() &&
         LegalVT.getVectorElementType().getSizeInBits() ==
             BaseTp->getElementType()->getPrimitiveSizeInBits() &&
-        LegalVT.getVectorNumElements() < BaseTp->getNumElements()) {
+        LegalVT.getVectorNumElements() <
+            cast<FixedVectorType>(BaseTp)->getNumElements()) {
 
       unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
       unsigned LegalVTSize = LegalVT.getStoreSize();
@@ -2935,7 +2936,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
         // 128-bit vector is free.
         // NOTE: This assumes legalization widens vXf32 vectors.
         if (MScalarTy == MVT::f32)
-          for (unsigned i = 0, e = Ty->getNumElements(); i < e; i += 4)
+          for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
+               i < e; i += 4)
             if (DemandedElts[i])
               Cost--;
       }
@@ -2951,7 +2953,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
       // vector elements, which represents the number of unpacks we'll end up
       // performing.
       unsigned NumElts = LT.second.getVectorNumElements();
-      unsigned Pow2Elts = PowerOf2Ceil(Ty->getNumElements());
+      unsigned Pow2Elts =
+          PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
       Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
     }
   }
@@ -2983,7 +2986,7 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   }
 
   // Handle non-power-of-two vectors such as <3 x float>
-  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
     unsigned NumElem = VTy->getNumElements();
 
     // Handle a few common cases:
@@ -3036,7 +3039,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
   bool IsLoad = (Instruction::Load == Opcode);
   bool IsStore = (Instruction::Store == Opcode);
 
-  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+  auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
   if (!SrcVTy)
     // To calculate scalar take the regular cost, without mask
     return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
@@ -3181,7 +3184,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
 
   MVT MTy = LT.second;
 
-  auto *ValVTy = cast<VectorType>(ValTy);
+  auto *ValVTy = cast<FixedVectorType>(ValTy);
 
   unsigned ArithmeticCost = 0;
   if (LT.first != 1 && MTy.isVector() &&
@@ -3562,7 +3565,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
         return Entry->Cost;
   }
 
-  auto *ValVTy = cast<VectorType>(ValTy);
+  auto *ValVTy = cast<FixedVectorType>(ValTy);
   unsigned NumVecElts = ValVTy->getNumElements();
 
   auto *Ty = ValVTy;
@@ -3850,7 +3853,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
                                 Align Alignment, unsigned AddressSpace) {
 
   assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
-  unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
+  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
 
   // Try to reduce index size from 64 bit (default for GEP)
   // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
@@ -3921,7 +3924,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
 int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
                                 bool VariableMask, Align Alignment,
                                 unsigned AddressSpace) {
-  unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
+  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
   APInt DemandedElts = APInt::getAllOnesValue(VF);
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
@@ -3969,7 +3972,7 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
     return 1;
 
   assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
-  unsigned VF = cast<VectorType>(SrcVTy)->getNumElements();
+  unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
   PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
   if (!PtrTy && Ptr->getType()->isVectorTy())
     PtrTy = dyn_cast<PointerType>(
@@ -4020,7 +4023,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
 
   // The backend can't handle a single element vector.
   if (isa<VectorType>(DataTy) &&
-      cast<VectorType>(DataTy)->getNumElements() == 1)
+      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
     return false;
   Type *ScalarTy = DataTy->getScalarType();
 
@@ -4085,7 +4088,7 @@ bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
     return false;
 
   // The backend can't handle a single element vector.
-  if (cast<VectorType>(DataTy)->getNumElements() == 1)
+  if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
     return false;
 
   Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
@@ -4124,7 +4127,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
   // In this case we can reject non-power-of-2 vectors.
   // We also reject single element vectors as the type legalizer can't
   // scalarize it.
-  if (auto *DataVTy = dyn_cast<VectorType>(DataTy)) {
+  if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
     unsigned NumElts = DataVTy->getNumElements();
     if (NumElts == 1 || !isPowerOf2_32(NumElts))
       return false;
@@ -4245,9 +4248,9 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
 // shuffles. We therefore use a lookup table instead, filled according to
 // the instruction sequences that codegen currently generates.
 int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
-    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
-    bool UseMaskForCond, bool UseMaskForGaps) {
+    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
 
   if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -4274,8 +4277,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
                                              Alignment, AddressSpace,
                                              CostKind);
 
-  unsigned VF = cast<VectorType>(VecTy)->getNumElements() / Factor;
-  Type *ScalarTy = cast<VectorType>(VecTy)->getElementType();
+  unsigned VF = VecTy->getNumElements() / Factor;
+  Type *ScalarTy = VecTy->getElementType();
 
   // Calculate the number of memory operations (NumOfMemOps), required
   // for load/store the VecTy.
@@ -4284,9 +4287,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
 
   // Get the cost of one memory operation.
-  auto *SingleMemOpTy =
-      FixedVectorType::get(cast<VectorType>(VecTy)->getElementType(),
-                           LegalVT.getVectorNumElements());
+  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+                                             LegalVT.getVectorNumElements());
   unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
                                        MaybeAlign(Alignment), AddressSpace,
                                        CostKind);
@@ -4363,9 +4365,9 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
 // \p Factor - the factor of interleaving.
 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
 int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
-    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
-    bool UseMaskForCond, bool UseMaskForGaps) {
+    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
 
   if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -4384,14 +4386,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
 
   // Get the cost of one memory operation.
-  auto *SingleMemOpTy =
-      FixedVectorType::get(cast<VectorType>(VecTy)->getElementType(),
-                           LegalVT.getVectorNumElements());
+  auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+                                             LegalVT.getVectorNumElements());
   unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
                                        MaybeAlign(Alignment), AddressSpace,
                                        CostKind);
 
-  unsigned VF = cast<VectorType>(VecTy)->getNumElements() / Factor;
+  unsigned VF = VecTy->getNumElements() / Factor;
   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
 
   if (Opcode == Instruction::Load) {
@@ -4423,9 +4424,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
 
     unsigned NumOfLoadsInInterleaveGrp =
         Indices.size() ? Indices.size() : Factor;
-    auto *ResultTy = FixedVectorType::get(
-        cast<VectorType>(VecTy)->getElementType(),
-        cast<VectorType>(VecTy)->getNumElements() / Factor);
+    auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
+                                          VecTy->getNumElements() / Factor);
     unsigned NumOfResults =
         getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
         NumOfLoadsInInterleaveGrp;
@@ -4501,13 +4501,13 @@ int X86TTIImpl::getInterleavedMemoryOpCost(
     return false;
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
-    return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace, CostKind,
-                                            UseMaskForCond, UseMaskForGaps);
+    return getInterleavedMemoryOpCostAVX512(
+        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
   if (ST->hasAVX2())
-    return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace, CostKind,
-                                          UseMaskForCond, UseMaskForGaps);
+    return getInterleavedMemoryOpCostAVX2(
+        Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+        AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace, CostKind,

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index d036ffb9335b..b97072893cc2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -174,13 +174,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
       bool UseMaskForCond = false, bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX512(
-      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      Align Alignment, unsigned AddressSpace,
+      unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+      ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
       bool UseMaskForCond = false, bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX2(
-      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      Align Alignment, unsigned AddressSpace,
+      unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+      ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
       TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
       bool UseMaskForCond = false, bool UseMaskForGaps = false);