[llvm] 8fb0559 - [VectorCombine] allow vector loads with mismatched insert type

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 2 05:11:46 PDT 2020


Author: Sanjay Patel
Date: 2020-09-02T08:11:36-04:00
New Revision: 8fb055932c085da21f3b721995a06f42006744bd

URL: https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd
DIFF: https://github.com/llvm/llvm-project/commit/8fb055932c085da21f3b721995a06f42006744bd.diff

LOG: [VectorCombine] allow vector loads with mismatched insert type

This is an enhancement to D81766 that allows loading the minimum target
vector type even when the IR vector type has a different number of elements.

In one of the motivating tests from PR16739, SLP creates <2 x float>
load ops mixed with <4 x float> insert ops, so we want to handle that
pattern in addition to potential oversized vectors created by the
vectorizers.

For now, we assume the insert/extract of a subvector with undef is free
because there is no exact corresponding TTI cost modeling for it.
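
As a concrete example of the transform (taken from the updated tests
below; the <4 x float> minimum vector type assumes a 128-bit minimum
vector register width, as on x86 with SSE), the shrink case turns:

  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0

into:

  %1 = bitcast float* %p to <4 x float>*
  %2 = load <4 x float>, <4 x float>* %1, align 4
  %r = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> <i32 0, i32 1>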

Differential Revision: https://reviews.llvm.org/D86160

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/load.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 1cc0e40da3a2..a954b9b29315 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -100,36 +100,36 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;
 
   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
 
-  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
-  if (!ScalarSize || !VectorSize || VectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
-  unsigned VecNumElts = VectorSize / ScalarSize;
-  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(PtrOp, MinVecTy, Alignment, DL, Load, &DT))
     return false;
 
   unsigned AS = Load->getPointerAddressSpace();
 
   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, true, false);
 
   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,8 +139,18 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != MinVecTy) {
+    unsigned OutputNumElts = Ty->getNumElements();
+    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
+      Mask[i] = i;
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(MinVecTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;

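For reference on the new shuffle above: with MinVecNumElts = 4 and an
<8 x i32> insert type (the widening case in the tests below), the loop
produces the identity mask <0, 1, 2, 3, undef, undef, undef, undef>, so
the wide value is formed as:

  %r = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

where %ld names the new <4 x i32> load. The upper lanes are undef, which
is why the cost model treats the subvector insert/extract as free.
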
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index e24ffb8da66f..f0c5b6ef7ad8 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -346,12 +346,11 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4f32.
-
 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
   ret <16 x float> %r
 }
 
-; TODO: Should load v4f32.
-
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
