[llvm] 43bdac2 - [VectorCombine] try to create vector loads from scalar loads

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 9 06:19:24 PDT 2020


Author: Sanjay Patel
Date: 2020-08-09T09:05:06-04:00
New Revision: 43bdac290663f4424f9fb3920c47c7288a2aabb4

URL: https://github.com/llvm/llvm-project/commit/43bdac290663f4424f9fb3920c47c7288a2aabb4
DIFF: https://github.com/llvm/llvm-project/commit/43bdac290663f4424f9fb3920c47c7288a2aabb4.diff

LOG: [VectorCombine] try to create vector loads from scalar loads

This patch was adjusted to match only the most basic pattern that starts with an insertelement
(so no extract is created here). Hopefully, that removes any concern about
interfering with other passes; i.e., the transform should almost always be profitable.

We could argue that this belongs in canonicalization, but we conservatively
try not to create vector ops from scalar ops in passes like instcombine.

If the transform is not profitable, the backend should be able to re-scalarize the load.

Differential Revision: https://reviews.llvm.org/D81766
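
For illustration, here is the basic before/after IR pattern the transform handles, mirroring
the load_f32_insert_v4f32 test updated below (a minimal sketch; the %cast name is illustrative,
as the pass itself emits an unnamed temporary):

    ; Before: scalar load feeding an insertelement at index 0.
    define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
      %s = load float, float* %p, align 4
      %r = insertelement <4 x float> undef, float %s, i32 0
      ret <4 x float> %r
    }

    ; After: the pointer is bitcast and the full vector is loaded directly.
    define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
      %cast = bitcast float* %p to <4 x float>*
      %r = load <4 x float>, <4 x float>* %cast, align 4
      ret <4 x float> %r
    }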

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/load.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 64b41bf9cefa..688ea8fa2343 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@ using namespace llvm;
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@ class VectorCombine {
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,61 @@ static void replaceValue(Value &Old, Value &New) {
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
+  // Match insert of scalar load.
+  Value *Scalar;
+  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
+    return false;
+  auto *Load = dyn_cast<LoadInst>(Scalar);
+  Type *ScalarTy = Scalar->getType();
+  if (!Load || !Load->isSimple())
+    return false;
+
+  // TODO: Extend this to match GEP with constant offsets.
+  Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
+
+  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+  if (!ScalarSize || VectorSize % ScalarSize != 0)
+    return false;
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  unsigned VecNumElts = VectorSize / ScalarSize;
+  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
+  // TODO: Allow insert/extract subvector if the type does not match.
+  if (VectorTy != I.getType())
+    return false;
+  Align Alignment = Load->getAlign();
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+    return false;
+
+  // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
+  int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment,
+                                    Load->getPointerAddressSpace());
+  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+
+  // New pattern: load VecPtr
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment,
+                                    Load->getPointerAddressSpace());
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost)
+    return false;
+
+  // It is safe and potentially profitable to load a vector directly:
+  // inselt undef, load Scalar, 0 --> load VecPtr
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo());
+  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -625,6 +683,7 @@ bool VectorCombine::run() {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoadInsert(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);

diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 6f229044cf56..edd1e4af099e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -174,8 +174,8 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
 
 define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32(
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -185,9 +185,7 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p
 
 define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b = bitcast <4 x float>* %p to float*
@@ -196,10 +194,12 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
   ret <4 x float> %r
 }
 
+; Element type does not change cost.
+
 define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -207,11 +207,12 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
   ret <4 x i32> %r
 }
 
+; Pointer type does not change cost.
+
 define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b = bitcast <16 x i8>* %p to i32*
@@ -220,11 +221,11 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceabl
   ret <4 x i32> %r
 }
 
+; This is canonical form for vector element access.
+
 define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load float, float* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
+; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -233,11 +234,13 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenc
   ret <4 x float> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -246,6 +249,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
@@ -259,11 +264,13 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
+; If there are enough dereferenceable bytes, we can offset the vector load.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -272,6 +279,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
+; Negative test - can't safely load the offset vector, but could load+shuffle.
+
 define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) {
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
@@ -285,6 +294,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
+; Negative test - do not alter volatile.
+
 define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
 ; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
@@ -296,6 +307,8 @@ define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceab
   ret <4 x float> %r
 }
 
+; Negative test? - pointer is not as aligned as load.
+
 define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -307,6 +320,8 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(1
   ret <4 x float> %r
 }
 
+; Negative test - not enough bytes.
+
 define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32_deref(
 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
@@ -318,6 +333,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
@@ -329,6 +346,8 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
 
+; TODO: Should load v4i32.
+
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*


        

