[llvm-branch-commits] [llvm] 67a2904 - [VectorCombine] Insert addrspacecast when crossing address space boundaries

Tom Stellard via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Apr 6 13:55:09 PDT 2022


Author: Fraser Cormack
Date: 2022-04-06T11:57:24-07:00
New Revision: 67a290460c374d5e0d18a06c798896cac0b19e59

URL: https://github.com/llvm/llvm-project/commit/67a290460c374d5e0d18a06c798896cac0b19e59
DIFF: https://github.com/llvm/llvm-project/commit/67a290460c374d5e0d18a06c798896cac0b19e59.diff

LOG: [VectorCombine] Insert addrspacecast when crossing address space boundaries

We cannot bitcast pointers across different address spaces. This was
previously fixed in D89577, but D93229 then added an enhancement that
peeks further through the pointer operand, opening up the possibility
that address-space violations could be introduced.

Instead of bailing as the previous fix did, simply insert an
addrspacecast instruction.
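
For context, the change boils down to swapping the IRBuilder call used to
cast the source pointer. Below is a minimal sketch, assuming a helper named
castScalarPtrToVecPtr (the helper and its signature are illustrative, not
part of the patch); it relies on CreatePointerBitCastOrAddrSpaceCast
emitting a plain bitcast when the address spaces already match and an
addrspacecast otherwise:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Type.h"
    #include "llvm/IR/Value.h"

    using namespace llvm;

    // Illustrative helper: cast SrcPtr to a pointer to MinVecTy in address
    // space AS. A plain CreateBitCast would produce invalid IR if SrcPtr
    // lives in a different address space than AS;
    // CreatePointerBitCastOrAddrSpaceCast picks the correct cast opcode.
    static Value *castScalarPtrToVecPtr(IRBuilder<> &Builder, Value *SrcPtr,
                                        Type *MinVecTy, unsigned AS) {
      return Builder.CreatePointerBitCastOrAddrSpaceCast(
          SrcPtr, MinVecTy->getPointerTo(AS));
    }

In the AMDGPU tests below, this is why the former addrspacecast/GEP/bitcast
chain collapses into a single addrspacecast of the addrspace(5) alloca.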

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D121787

(cherry picked from commit 2e44b7872bc638ed884ae4aa86e38b3b47e0b65a)

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
    llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 620d388199e0f..258f6c67e54d5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -152,12 +152,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
 
-  // If original AS != Load's AS, we can't bitcast the original pointer and have
-  // to use Load's operand instead. Ideally we would want to strip pointer casts
-  // without changing AS, but there's no API to do that ATM.
   unsigned AS = Load->getPointerAddressSpace();
-  if (AS != SrcPtr->getType()->getPointerAddressSpace())
-    SrcPtr = Load->getPointerOperand();
 
   // We are potentially transforming byte-sized (8-bit) memory accesses, so make
   // sure we have all of our type-based constraints in place for this target.
@@ -245,7 +240,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 

diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
index 688e3c4cf6c7a..b493c2a4f8bb5 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition-inseltpoison.ll
@@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
index 3468cb540903e..8e36856714b24 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
@@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
 ; CHECK-LABEL: @load_from_other_as(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
-; CHECK-NEXT:    [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
-; CHECK-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index cf5a6c0772f7f..e4ea44ae6e572 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -253,6 +253,23 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(
   ret <4 x float> %r
 }
 
+; Should work with addrspace even when peeking past unsafe loads through geps
+
+define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
+; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
+; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
+;
+  %t0 = getelementptr inbounds i32, i32* %v3, i32 1
+  %t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
+  %t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
+  %val = load i32, i32 addrspace(42)* %t2, align 4
+  %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
+  ret <4 x i32> %inselt
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {




More information about the llvm-branch-commits mailing list