[llvm] [AMDGPU] Restrict promote alloca on pointers across address spaces (PR #119762)

Sumanth Gundapaneni via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 20 11:53:32 PST 2024


https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/119762

>From 770364bcffeef16e27f65c81ef1f5d6803e16041 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sugundap at useocpm2m-387-002.amd.com>
Date: Thu, 12 Dec 2024 13:04:20 -0800
Subject: [PATCH 1/2] [AMDGPU] Restrict promote alloca on pointers across
 address spaces

If a load or store accesses the stack with a pointer type from a different
address space, restrict the promote alloca pass from vectorizing when the
pointer storage sizes differ.
Example: In address space 0, the pointer size is 64 bits.
         In address space 5, the pointer size is 32 bits.
Casting a pointer across address spaces with different pointer sizes is
undefined behavior.
The assertion was found through fuzzing.
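As a rough illustration of the rule (not the pass code itself), here is a
minimal, self-contained C++ sketch. The helper name isCompatiblePointerAccess
and the pointer-size table are assumptions that mirror the AMDGPU data layout
described above:

#include <iostream>
#include <map>

// Pointer storage sizes per address space, mirroring the AMDGPU data layout
// (flat pointers are 64-bit; LDS and private/scratch pointers are 32-bit).
static const std::map<unsigned, unsigned> PointerSizeInBytes = {
    {0, 8}, // addrspace(0): flat
    {3, 4}, // addrspace(3): LDS
    {5, 4}, // addrspace(5): private/stack
};

// Hypothetical helper: AccessAS is the address space of the pointer type
// being loaded or stored, AllocaAS is the address space of the accessed
// memory (the alloca, addrspace(5) on AMDGPU).
static bool isCompatiblePointerAccess(unsigned AccessAS, unsigned AllocaAS) {
  return PointerSizeInBytes.at(AccessAS) == PointerSizeInBytes.at(AllocaAS);
}

int main() {
  // store <2 x ptr> %arg, ptr addrspace(5) %alloca        -> reject (8 != 4)
  std::cout << isCompatiblePointerAccess(0, 5) << '\n';    // prints 0
  // load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca -> ok     (4 == 4)
  std::cout << isCompatiblePointerAccess(3, 5) << '\n';    // prints 1
  return 0;
}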
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 10 +++
 .../AMDGPU/promote-alloca-loadstores.ll       | 67 ++++++++++---------
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 51 +++++++-------
 3 files changed, 70 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e27ef71c1c0883..913a601b0e0888 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -796,6 +796,16 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       if (!IsSimple)
         return RejectUser(Inst, "not a simple load or store");
 
+      // If the access type is a pointer, reject the address spaces with
+      // different pointer sizes.
+      // store <2 x ptr> %arg, ptr addrspace(5) %alloca - Reject.
+      // %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca - ok.
+      if (AccessTy->isPtrOrPtrVectorTy()) {
+        if (DL->getPointerSize(getLoadStoreAddressSpace(Inst)) !=
+            DL->getPointerSize(AccessTy->getPointerAddressSpace()))
+          return RejectUser(Inst, "pointers to incompatible address spaces");
+      }
+
       Ptr = Ptr->stripPointerCasts();
 
       // Alloca already accessed as vector.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 1e49500a243e10..2a22cdda7a7e79 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -93,21 +93,6 @@ end:
   ret void
 }
 
-define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
-; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
-; CHECK-SAME: (ptr [[ARG:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
-; CHECK-NEXT:    ret ptr [[ARG]]
-;
-entry:
-  %alloca = alloca [8 x i8], align 8, addrspace(5)
-  store ptr %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
-  ret ptr %tmp
-}
-
 define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
 ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
 ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
@@ -123,22 +108,6 @@ entry:
   ret ptr addrspace(3) %tmp
 }
 
-define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
-; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
-; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
-; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP2]]
-;
-entry:
-  %alloca = alloca [4 x i32], align 8, addrspace(5)
-  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
-  ret <4 x ptr addrspace(3)> %tmp
-}
-
 define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
 ; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
 ; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
@@ -198,3 +167,39 @@ entry:
   %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8
   ret ptr addrspace(3) %tmp
 }
+
+; Will not vectorize because we are doing a load/store of a pointer across
+; address spaces of varying pointer sizes.
+define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
+; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
+; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
+; CHECK-NEXT:    store ptr [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret ptr [[TMP]]
+;
+entry:
+  %alloca = alloca [8 x i8], align 8, addrspace(5)
+  store ptr %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
+  ret ptr %tmp
+}
+
+; Will not vectorize because we are doing a load/store of a pointer across
+; address spaces of varying pointer sizes.
+define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
+; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
+; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 8, addrspace(5)
+; CHECK-NEXT:    store <2 x ptr> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP]]
+;
+entry:
+  %alloca = alloca [4 x i32], align 8, addrspace(5)
+  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret <4 x ptr addrspace(3)> %tmp
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index 7c5410004ed5b7..b583f33a7d9e63 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -218,38 +218,35 @@ entry:
   ret void
 }
 
-define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
+define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(3)> %val.0, <4 x ptr addrspace(3)> %val.1) {
 ; CHECK-LABEL: define void @test_different_type_subvector_ptrs
-; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
+; CHECK-SAME: (<2 x ptr addrspace(3)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
-; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
-; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
-; CHECK-NEXT:    ret void
-;
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr <2 x i32> [[TMP5]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = inttoptr <4 x i32> [[TMP15]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP16]]
 entry:
   %stack = alloca [4 x i64], align 4, addrspace(5)
 
-  store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
-  %reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
-  %dummyuser = freeze <2 x ptr addrspace(1)> %reload
+  store <2 x ptr addrspace(3)> %val.0, ptr addrspace(5) %stack
+  %reload = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x ptr addrspace(3)> %reload
 
   store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
   %reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack

>From 02e37c67e4fb996c3386acd09c4165505953f53a Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Fri, 20 Dec 2024 11:51:51 -0800
Subject: [PATCH 2/2] [AMDGPU] Restrict promote alloca on pointers across
 address spaces

If a load or store accesses the stack with a pointer type from a different
address space, restrict the promote alloca pass from vectorizing when the
pointer storage sizes differ.
Example: In address space 0, the pointer size is 64 bits.
         In address space 5, the pointer size is 32 bits.
Casting a pointer across address spaces with different pointer sizes is
undefined behavior.
The assertion was found through fuzzing.
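For reference, a rough sketch of where this revision performs the comparison:
the check now sits in the access-type compatibility test rather than at each
load/store user, so pointer accesses are also screened on the path that treats
the alloca as already vector-typed. The Ty struct and the isSupportedAccess
name below are simplified stand-ins, not actual LLVM API:

#include <iostream>
#include <map>

static const std::map<unsigned, unsigned> PointerSizeInBytes = {
    {0, 8}, {1, 8}, {3, 4}, {5, 4}};

// Simplified stand-in for an LLVM type; not real API.
struct Ty {
  bool IsPtrOrPtrVector;     // true for ptr / <N x ptr> types
  unsigned AddrSpace;        // pointer address space (if IsPtrOrPtrVector)
  unsigned StoreSizeInBytes; // type store size
};

// Mirrors the intent of the updated isSupportedAccessType(): when both the
// access type and the promoted vector's element type are pointers, their
// address spaces must use the same pointer size; otherwise fall through to
// the usual store-size divisibility test.
static bool isSupportedAccess(const Ty &AccessTy, const Ty &VecElemTy) {
  if (AccessTy.IsPtrOrPtrVector && VecElemTy.IsPtrOrPtrVector &&
      PointerSizeInBytes.at(AccessTy.AddrSpace) !=
          PointerSizeInBytes.at(VecElemTy.AddrSpace))
    return false;
  return AccessTy.StoreSizeInBytes % VecElemTy.StoreSizeInBytes == 0;
}

int main() {
  Ty FlatPtrVec{true, 0, 16}; // <2 x ptr>              : 2 x 8 bytes
  Ty ScratchPtr{true, 5, 4};  // ptr addrspace(5)       : 4 bytes
  Ty LdsPtrVec{true, 3, 16};  // <4 x ptr addrspace(3)> : 4 x 4 bytes
  std::cout << isSupportedAccess(FlatPtrVec, ScratchPtr) << '\n'; // prints 0
  std::cout << isSupportedAccess(LdsPtrVec, ScratchPtr) << '\n';  // prints 1
  return 0;
}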
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 36 +++++----
 .../AMDGPU/promote-alloca-loadstores.ll       | 79 ++++++++++++-------
 .../CodeGen/AMDGPU/promote-alloca-subvecs.ll  | 51 ++++++------
 3 files changed, 99 insertions(+), 67 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 913a601b0e0888..834c65ca726b45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -674,7 +674,17 @@ static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
   //
   // We could handle more complicated cases, but it'd make things a lot more
   // complicated.
-  if (isa<FixedVectorType>(AccessTy)) {
+
+  // If both are pointer types, verify if they are compatible to copy across
+  // address spaces.
+  bool canCopyAcrossAddressSpaces = true;
+  if (AccessTy->isPtrOrPtrVectorTy() && VecTy->isPtrOrPtrVectorTy()) {
+    if (DL.getPointerSize(AccessTy->getPointerAddressSpace()) !=
+        DL.getPointerSize(VecTy->getPointerAddressSpace()))
+      canCopyAcrossAddressSpaces = false;
+  }
+
+  if (isa<FixedVectorType>(AccessTy) && canCopyAcrossAddressSpaces) {
     TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
     TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
     return AccTS.isKnownMultipleOf(VecTS);
@@ -796,23 +806,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       if (!IsSimple)
         return RejectUser(Inst, "not a simple load or store");
 
-      // If the access type is a pointer, reject the address spaces with
-      // different pointer sizes.
-      // store <2 x ptr> %arg, ptr addrspace(5) %alloca - Reject.
-      // %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca - ok.
-      if (AccessTy->isPtrOrPtrVectorTy()) {
-        if (DL->getPointerSize(getLoadStoreAddressSpace(Inst)) !=
-            DL->getPointerSize(AccessTy->getPointerAddressSpace()))
-          return RejectUser(Inst, "pointers to incompatible address spaces");
-      }
-
       Ptr = Ptr->stripPointerCasts();
 
-      // Alloca already accessed as vector.
-      if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
-                                DL->getTypeStoreSize(AccessTy)) {
-        WorkList.push_back(Inst);
-        continue;
+      // Since pointer sizes differ across address spaces, let
+      // isSupportedAccessType() handle pointer load and store accesses.
+      if (!AccessTy->isPtrOrPtrVectorTy() || !VectorTy->isPtrOrPtrVectorTy()) {
+        // Alloca already accessed as vector.
+        if (Ptr == &Alloca &&
+            DL->getTypeStoreSize(AllocaTy) == DL->getTypeStoreSize(AccessTy)) {
+          WorkList.push_back(Inst);
+          continue;
+        }
       }
 
       if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 2a22cdda7a7e79..9dde947b0b0c65 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -93,6 +93,21 @@ end:
   ret void
 }
 
+define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
+; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
+; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8>
+; CHECK-NEXT:    ret ptr [[ARG]]
+;
+entry:
+  %alloca = alloca [8 x i8], align 8, addrspace(5)
+  store ptr %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
+  ret ptr %tmp
+}
+
 define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) {
 ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec
 ; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) {
@@ -108,6 +123,22 @@ entry:
   ret ptr addrspace(3) %tmp
 }
 
+define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
+; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
+; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP2]]
+;
+entry:
+  %alloca = alloca [4 x i32], align 8, addrspace(5)
+  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret <4 x ptr addrspace(3)> %tmp
+}
+
 define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
 ; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
 ; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
@@ -168,38 +199,32 @@ entry:
   ret ptr addrspace(3) %tmp
 }
 
-; Will not vectorize because we are doing a load/store of a pointer across
-; address spaces of varying pointer sizes.
-define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) {
-; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec
-; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; Will not vectorize because we are saving a 64-bit pointer from addrspace 0
+; into two 32-bit pointers of addrspace 5.
+; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5)
-; CHECK-NEXT:    store ptr [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    [[TMP:%.*]] = load ptr, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    ret ptr [[TMP]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5)
+; CHECK-NEXT:    store ptr undef, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    ret void
 ;
+define void @alloca_load_store_ptr_mixed_addrspace_ptrvec() {
 entry:
-  %alloca = alloca [8 x i8], align 8, addrspace(5)
-  store ptr %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
-  ret ptr %tmp
+  %A2 = alloca <2 x ptr addrspace(5)>, align 8, addrspace(5)
+  store  ptr undef, ptr addrspace(5) %A2, align 8
+  ret void
 }
 
-; Will not vectorize because we are doing a load/store of a pointer across
-; address spaces of varying pointer sizes.
-define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) {
-; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec
-; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; Will not vectorize because we are saving 32-bit pointers from addrspace 5
+; into two 64-bit pointers of addrspace 0, even though the size in memory
+; is the same.
+; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 8, addrspace(5)
-; CHECK-NEXT:    store <2 x ptr> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    [[TMP:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[TMP]]
-;
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca <2 x ptr>, align 8
+; CHECK-NEXT:    store <4 x ptr addrspace(5)> undef, ptr  [[ALLOCA]], align 8
+; CHECK-NEXT:    ret void
+define void @alloca_load_store_ptr_mixed_addrspace_ptrvec2() {
 entry:
-  %alloca = alloca [4 x i32], align 8, addrspace(5)
-  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
-  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
-  ret <4 x ptr addrspace(3)> %tmp
+  %A2 = alloca <2 x ptr>, align 8
+  store <4 x ptr addrspace(5)> undef, ptr %A2, align 8
+  ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
index b583f33a7d9e63..7c5410004ed5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -218,35 +218,38 @@ entry:
   ret void
 }
 
-define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(3)> %val.0, <4 x ptr addrspace(3)> %val.1) {
+define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
 ; CHECK-LABEL: define void @test_different_type_subvector_ptrs
-; CHECK-SAME: (<2 x ptr addrspace(3)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
+; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_0]] to <2 x i32>
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr <2 x i32> [[TMP5]] to <2 x ptr addrspace(3)>
-; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <2 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP8]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP11]], i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> [[TMP13]], i64 [[TMP11]], i64 1
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <4 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = inttoptr <4 x i32> [[TMP15]] to <4 x ptr addrspace(3)>
-; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP16]]
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
+; CHECK-NEXT:    ret void
+;
 entry:
   %stack = alloca [4 x i64], align 4, addrspace(5)
 
-  store <2 x ptr addrspace(3)> %val.0, ptr addrspace(5) %stack
-  %reload = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
-  %dummyuser = freeze <2 x ptr addrspace(3)> %reload
+  store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
+  %reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x ptr addrspace(1)> %reload
 
   store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
   %reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack


