[llvm-branch-commits] [llvm] d399f87 - [VectorCombine] make load transform poison-safe
Sanjay Patel via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 14 14:46:52 PST 2020
Author: Sanjay Patel
Date: 2020-12-14T17:42:01-05:00
New Revision: d399f870b5a94b9dcc1817ed69fec88c325bb817
URL: https://github.com/llvm/llvm-project/commit/d399f870b5a94b9dcc1817ed69fec88c325bb817
DIFF: https://github.com/llvm/llvm-project/commit/d399f870b5a94b9dcc1817ed69fec88c325bb817.diff
LOG: [VectorCombine] make load transform poison-safe
As noted in D93229, the transform from scalar load to vector load
potentially leaks poison from the extra vector elements that are
being loaded.
We could use freeze here (and x86 codegen at least appears to be
the same either way), but we already have a shuffle in this logic
to optionally change the vector size, so let's allow that
instruction to serve both purposes.
Differential Revision: https://reviews.llvm.org/D93238
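To illustrate the poison fix, here is a minimal before/after sketch in the shape of the load_f32_insert_v4f32 test updated below; the "before" IR is an assumption reconstructed from the matched pattern (a scalar load feeding an insertelement into lane 0), since the diff only shows the CHECK lines:

  ; before: scalar load + insert into lane 0
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0

  ; after: wide load, then a shuffle that keeps only lane 0
  %1 = bitcast float* %p to <4 x float>*
  %2 = load <4 x float>, <4 x float>* %1, align 4
  %r = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>

Because the shuffle mask defines only lane 0, any poison in the extra loaded lanes cannot reach the result, and the same shuffle still handles the optional shrink/grow to the output vector size.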
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19f5a2b432f7a..89b60045ce910 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -161,15 +161,17 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);

-  // If the insert type does not match the target's minimum vector type,
-  // use an identity shuffle to shrink/grow the vector.
-  if (Ty != MinVecTy) {
-    unsigned OutputNumElts = Ty->getNumElements();
-    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
-    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
-      Mask[i] = i;
-    VecLd = Builder.CreateShuffleVector(VecLd, Mask);
-  }
+  // Set everything but element 0 to undef to prevent poison from propagating
+  // from the extra loaded memory. This will also optionally shrink/grow the
+  // vector from the loaded size to the output size.
+  // We assume this operation has no cost in codegen.
+  // Note that we could use freeze to avoid poison problems, but then we might
+  // still need a shuffle to change the vector size.
+  unsigned OutputNumElts = Ty->getNumElements();
+  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+  Mask[0] = 0;
+  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index ba2bf3f37d7b6..03902c48157fe 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -175,7 +175,8 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, float* %p, align 4
@@ -185,7 +186,8 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p
define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%b = bitcast <4 x float>* %p to float*
@@ -199,7 +201,8 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%s = load i32, i32* %p, align 4
@@ -212,7 +215,8 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%b = bitcast <16 x i8>* %p to i32*
@@ -225,7 +229,8 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceabl
define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -238,7 +243,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenc
define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
@@ -253,7 +259,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -283,7 +290,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -410,7 +418,7 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%s = load i32, i32* %p, align 4
@@ -421,7 +429,7 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%b = bitcast <4 x i32>* %p to i32*
@@ -434,7 +442,7 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <16 x float> [[R]]
;
%s = load float, float* %p, align 4
@@ -446,7 +454,7 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT: ret <2 x float> [[R]]
;
%s = load float, float* %p, align 4
@@ -500,7 +508,8 @@ define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr
define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%l = load <2 x float>, <2 x float>* %p, align 4
@@ -512,7 +521,8 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 derefe
define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%l = load <8 x float>, <8 x float>* %p, align 4