[llvm] 0d18d36 - [LoadStoreVectorizer] Convert tests to opaque pointers (NFC)

Nikita Popov via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 27 04:14:21 PST 2022


Author: Nikita Popov
Date: 2022-12-27T13:13:56+01:00
New Revision: 0d18d36b18316bc49f94d6cfca3f871bfbca086a

URL: https://github.com/llvm/llvm-project/commit/0d18d36b18316bc49f94d6cfca3f871bfbca086a
DIFF: https://github.com/llvm/llvm-project/commit/0d18d36b18316bc49f94d6cfca3f871bfbca086a.diff

LOG: [LoadStoreVectorizer] Convert tests to opaque pointers (NFC)
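
In brief: under opaque pointers, typed pointee types such as i32 addrspace(1)* are all spelled ptr addrspace(1), so pointer bitcasts that only changed the pointee type become redundant and are dropped from both the test bodies and the autogenerated CHECK lines. A minimal before/after sketch, adapted from the merge_global_store_2_constants_i32 change in merge-stores.ll below (no functional change intended):

    ; Typed pointers: a bitcast was needed to store a vector through an i32 pointer.
    %cast = bitcast i32 addrspace(1)* %out to <2 x i32> addrspace(1)*
    store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %cast, align 4

    ; Opaque pointers: the same store goes directly through the ptr argument.
    store <2 x i32> <i32 456, i32 123>, ptr addrspace(1) %out, align 4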

Added: 
    

Modified: 
    llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
    llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
    llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
    llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
    llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
    llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
    llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
    llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
index 83ff222fa391..3c2eea39689d 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
@@ -6,30 +6,29 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 
 ; This fails to vectorize if the !alias.scope is not used
 
-define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @vectorize_alias_scope(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture %b, ptr addrspace(1) nocapture readonly %c) #0 {
 ; SCOPE-LABEL: @vectorize_alias_scope(
 ; SCOPE-NEXT:  entry:
-; SCOPE-NEXT:    [[LD_C:%.*]] = load float, float addrspace(1)* [[C:%.*]], align 4, !alias.scope !0
-; SCOPE-NEXT:    [[TMP0:%.*]] = bitcast float addrspace(1)* [[A:%.*]] to <2 x float> addrspace(1)*
-; SCOPE-NEXT:    store <2 x float> zeroinitializer, <2 x float> addrspace(1)* [[TMP0]], align 4, !noalias !0
-; SCOPE-NEXT:    store float [[LD_C]], float addrspace(1)* [[B:%.*]], align 4, !noalias !0
+; SCOPE-NEXT:    [[LD_C:%.*]] = load float, ptr addrspace(1) [[C:%.*]], align 4, !alias.scope !0
+; SCOPE-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[A:%.*]], align 4, !noalias !0
+; SCOPE-NEXT:    store float [[LD_C]], ptr addrspace(1) [[B:%.*]], align 4, !noalias !0
 ; SCOPE-NEXT:    ret void
 ;
 ; NOSCOPE-LABEL: @vectorize_alias_scope(
 ; NOSCOPE-NEXT:  entry:
-; NOSCOPE-NEXT:    [[A_IDX_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[A:%.*]], i64 1
-; NOSCOPE-NEXT:    store float 0.000000e+00, float addrspace(1)* [[A]], align 4, !noalias !0
-; NOSCOPE-NEXT:    [[LD_C:%.*]] = load float, float addrspace(1)* [[C:%.*]], align 4, !alias.scope !0
-; NOSCOPE-NEXT:    store float 0.000000e+00, float addrspace(1)* [[A_IDX_1]], align 4, !noalias !0
-; NOSCOPE-NEXT:    store float [[LD_C]], float addrspace(1)* [[B:%.*]], align 4, !noalias !0
+; NOSCOPE-NEXT:    [[A_IDX_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A:%.*]], i64 1
+; NOSCOPE-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[A]], align 4, !noalias !0
+; NOSCOPE-NEXT:    [[LD_C:%.*]] = load float, ptr addrspace(1) [[C:%.*]], align 4, !alias.scope !0
+; NOSCOPE-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[A_IDX_1]], align 4, !noalias !0
+; NOSCOPE-NEXT:    store float [[LD_C]], ptr addrspace(1) [[B:%.*]], align 4, !noalias !0
 ; NOSCOPE-NEXT:    ret void
 ;
 entry:
-  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
-  %ld.c = load float, float addrspace(1)* %c, align 4, !alias.scope !0
-  store float 0.0, float addrspace(1)* %a.idx.1, align 4, !noalias !0
-  store float %ld.c, float addrspace(1)* %b, align 4, !noalias !0
+  %a.idx.1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  store float 0.0, ptr addrspace(1) %a, align 4, !noalias !0
+  %ld.c = load float, ptr addrspace(1) %c, align 4, !alias.scope !0
+  store float 0.0, ptr addrspace(1) %a.idx.1, align 4, !noalias !0
+  store float %ld.c, ptr addrspace(1) %b, align 4, !noalias !0
   ret void
 }
 

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index 6bb16ccd7233..26f9bcfdf550 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -7,250 +7,235 @@
 target triple = "amdgcn--"
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
-define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i8(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @load_unknown_offset_align1_i8(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    [[VAL0:%.*]] = load i8, i8 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    [[VAL1:%.*]] = load i8, i8 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    [[VAL0:%.*]] = load i8, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    [[VAL1:%.*]] = load i8, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    [[ADD:%.*]] = add i8 [[VAL0]], [[VAL1]]
-; ALIGNED-NEXT:    store i8 [[ADD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; ALIGNED-NEXT:    store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i8(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[PTR0]] to <2 x i8> addrspace(5)*
-; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i8>, <2 x i8> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    [[VAL01:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
 ; UNALIGNED-NEXT:    [[VAL12:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
 ; UNALIGNED-NEXT:    [[ADD:%.*]] = add i8 [[VAL01]], [[VAL12]]
-; UNALIGNED-NEXT:    store i8 [[ADD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; UNALIGNED-NEXT:    store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i8, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  %val1 = load i8, i8 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i8], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  %val0 = load i8, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i8, ptr addrspace(5) %ptr0, i32 1
+  %val1 = load i8, ptr addrspace(5) %ptr1, align 1
   %add = add i8 %val0, %val1
-  store i8 %add, i8 addrspace(1)* %out
+  store i8 %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i16(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @load_unknown_offset_align1_i16(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    [[VAL0:%.*]] = load i16, i16 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    [[VAL1:%.*]] = load i16, i16 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    [[VAL0:%.*]] = load i16, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    [[VAL1:%.*]] = load i16, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    [[ADD:%.*]] = add i16 [[VAL0]], [[VAL1]]
-; ALIGNED-NEXT:    store i16 [[ADD]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; ALIGNED-NEXT:    store i16 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i16(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[PTR0]] to <2 x i16> addrspace(5)*
-; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    [[VAL01:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
 ; UNALIGNED-NEXT:    [[VAL12:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
 ; UNALIGNED-NEXT:    [[ADD:%.*]] = add i16 [[VAL01]], [[VAL12]]
-; UNALIGNED-NEXT:    store i16 [[ADD]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; UNALIGNED-NEXT:    store i16 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 2
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i16, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  %val1 = load i16, i16 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i16], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  %val0 = load i16, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i16, ptr addrspace(5) %ptr0, i32 1
+  %val1 = load i16, ptr addrspace(5) %ptr1, align 1
   %add = add i16 %val0, %val1
-  store i16 %add, i16 addrspace(1)* %out
+  store i16 %add, ptr addrspace(1) %out
   ret void
 }
 
 ; FIXME: Although the offset is unknown here, we know it is a multiple
 ; of the element size, so should still be align 4
-define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @load_unknown_offset_align1_i32(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    [[VAL0:%.*]] = load i32, i32 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    [[VAL1:%.*]] = load i32, i32 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    [[VAL0:%.*]] = load i32, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    [[VAL1:%.*]] = load i32, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    [[ADD:%.*]] = add i32 [[VAL0]], [[VAL1]]
-; ALIGNED-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; ALIGNED-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i32(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
-; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    [[VAL01:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; UNALIGNED-NEXT:    [[VAL12:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; UNALIGNED-NEXT:    [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
-; UNALIGNED-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; UNALIGNED-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  %val0 = load i32, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
+  %val1 = load i32, ptr addrspace(5) %ptr1, align 1
   %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 ; Make sure alloca alignment isn't decreased
-define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; CHECK-LABEL: @load_alloca16_unknown_offset_align1_i32(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[PTR0]], align 4
 ; CHECK-NEXT:    [[VAL01:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[VAL12:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
-; CHECK-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [128 x i32], align 16, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  %val0 = load i32, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
+  %val1 = load i32, ptr addrspace(5) %ptr1, align 1
   %add = add i32 %val0, %val1
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i8(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @store_unknown_offset_align1_i8(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    store i8 9, i8 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    store i8 10, i8 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    store i8 9, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    store i8 10, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i8(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[PTR0]] to <2 x i8> addrspace(5)*
-; UNALIGNED-NEXT:    store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    store <2 x i8> <i8 9, i8 10>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i8], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i8 9, i8 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
-  store i8 10, i8 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i8], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  store i8 9, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i8, ptr addrspace(5) %ptr0, i32 1
+  store i8 10, ptr addrspace(5) %ptr1, align 1
   ret void
 }
 
-define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i16(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @store_unknown_offset_align1_i16(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    store i16 9, i16 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    store i16 10, i16 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    store i16 9, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    store i16 10, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i16(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[PTR0]] to <2 x i16> addrspace(5)*
-; UNALIGNED-NEXT:    store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    store <2 x i16> <i16 9, i16 10>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i16], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i16 9, i16 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
-  store i16 10, i16 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i16], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  store i16 9, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i16, ptr addrspace(5) %ptr0, i32 1
+  store i16 10, ptr addrspace(5) %ptr1, align 1
   ret void
 }
 
 ; FIXME: Although the offset is unknown here, we know it is a multiple
 ; of the element size, so it still should be align 4.
 
-define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
 ; ALIGNED-LABEL: @store_unknown_offset_align1_i32(
 ; ALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
-; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; ALIGNED-NEXT:    store i32 9, i32 addrspace(5)* [[PTR0]], align 1
-; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32 addrspace(5)* [[PTR0]], i32 1
-; ALIGNED-NEXT:    store i32 10, i32 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT:    store i32 9, ptr addrspace(5) [[PTR0]], align 1
+; ALIGNED-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
+; ALIGNED-NEXT:    store i32 10, ptr addrspace(5) [[PTR1]], align 1
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i32(
 ; UNALIGNED-NEXT:    [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
-; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
-; UNALIGNED-NEXT:    store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT:    store <2 x i32> <i32 9, i32 10>, ptr addrspace(5) [[PTR0]], align 1
 ; UNALIGNED-NEXT:    ret void
 ;
   %alloca = alloca [128 x i32], align 1, addrspace(5)
-  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
-  store i32 9, i32 addrspace(5)* %ptr0, align 1
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
-  store i32 10, i32 addrspace(5)* %ptr1, align 1
+  %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
+  store i32 9, ptr addrspace(5) %ptr0, align 1
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
+  store i32 10, ptr addrspace(5) %ptr1, align 1
   ret void
 }
 
 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
 ; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[OUT:%.*]] = bitcast [8 x i32] addrspace(5)* [[ALLOCA]] to i32 addrspace(5)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[OUT]] to <4 x i32> addrspace(5)*
-; CHECK-NEXT:    store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, ptr addrspace(5) [[ALLOCA]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %alloca, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(5) %alloca, i32 3
 
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  store i32 9, ptr addrspace(5) %alloca, align 1
+  store i32 1, ptr addrspace(5) %out.gep.1, align 1
+  store i32 23, ptr addrspace(5) %out.gep.2, align 1
+  store i32 19, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
 ; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
-; CHECK-NEXT:    [[OUT:%.*]] = bitcast [8 x i8] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[OUT]] to <4 x i8> addrspace(5)*
-; CHECK-NEXT:    store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, ptr addrspace(5) [[ALLOCA]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+  %out.gep.1 = getelementptr i8, ptr addrspace(5) %alloca, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(5) %alloca, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(5) %alloca, i8 3
 
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  store i8 9, ptr addrspace(5) %alloca, align 1
+  store i8 1, ptr addrspace(5) %out.gep.1, align 1
+  store i8 23, ptr addrspace(5) %out.gep.2, align 1
+  store i8 19, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
 define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
 ; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[OUT:%.*]] = bitcast [8 x i32] addrspace(5)* [[ALLOCA]] to i32 addrspace(5)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[OUT]] to <4 x i32> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(5) [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[LOAD01:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[LOAD23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
@@ -258,24 +243,21 @@ define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [8 x i32], align 1, addrspace(5)
-  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %alloca, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(5) %alloca, i32 3
 
-  %load0 = load i32, i32 addrspace(5)* %out, align 1
-  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+  %load0 = load i32, ptr addrspace(5) %alloca, align 1
+  %load1 = load i32, ptr addrspace(5) %out.gep.1, align 1
+  %load2 = load i32, ptr addrspace(5) %out.gep.2, align 1
+  %load3 = load i32, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
 define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
 ; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
-; CHECK-NEXT:    [[OUT:%.*]] = bitcast [8 x i8] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[OUT]] to <4 x i8> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr addrspace(5) [[ALLOCA]], align 4
 ; CHECK-NEXT:    [[LOAD01:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[LOAD12:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[LOAD23:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
@@ -283,64 +265,62 @@ define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [8 x i8], align 1, addrspace(5)
-  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+  %out.gep.1 = getelementptr i8, ptr addrspace(5) %alloca, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(5) %alloca, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(5) %alloca, i8 3
 
-  %load0 = load i8, i8 addrspace(5)* %out, align 1
-  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
-  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
-  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+  %load0 = load i8, ptr addrspace(5) %alloca, align 1
+  %load1 = load i8, ptr addrspace(5) %out.gep.1, align 1
+  %load2 = load i8, ptr addrspace(5) %out.gep.2, align 1
+  %load3 = load i8, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
 ; Make sure we don't think the alignment will increase if the base address isn't an alloca
-define void @private_store_2xi16_align2_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+define void @private_store_2xi16_align2_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; CHECK-LABEL: @private_store_2xi16_align2_not_alloca(
-; CHECK-NEXT:    [[GEP_R:%.*]] = getelementptr i16, i16 addrspace(5)* [[R:%.*]], i32 1
-; CHECK-NEXT:    store i16 1, i16 addrspace(5)* [[R]], align 2
-; CHECK-NEXT:    store i16 2, i16 addrspace(5)* [[GEP_R]], align 2
+; CHECK-NEXT:    [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
+; CHECK-NEXT:    store i16 1, ptr addrspace(5) [[R]], align 2
+; CHECK-NEXT:    store i16 2, ptr addrspace(5) [[GEP_R]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
-  store i16 1, i16 addrspace(5)* %r, align 2
-  store i16 2, i16 addrspace(5)* %gep.r, align 2
+  %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1
+  store i16 1, ptr addrspace(5) %r, align 2
+  store i16 2, ptr addrspace(5) %gep.r, align 2
   ret void
 }
 
-define void @private_store_2xi16_align1_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+define void @private_store_2xi16_align1_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; ALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
-; ALIGNED-NEXT:    [[GEP_R:%.*]] = getelementptr i16, i16 addrspace(5)* [[R:%.*]], i32 1
-; ALIGNED-NEXT:    store i16 1, i16 addrspace(5)* [[R]], align 1
-; ALIGNED-NEXT:    store i16 2, i16 addrspace(5)* [[GEP_R]], align 1
+; ALIGNED-NEXT:    [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
+; ALIGNED-NEXT:    store i16 1, ptr addrspace(5) [[R]], align 1
+; ALIGNED-NEXT:    store i16 2, ptr addrspace(5) [[GEP_R]], align 1
 ; ALIGNED-NEXT:    ret void
 ;
 ; UNALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[R:%.*]] to <2 x i16> addrspace(5)*
-; UNALIGNED-NEXT:    store <2 x i16> <i16 1, i16 2>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    store <2 x i16> <i16 1, i16 2>, ptr addrspace(5) [[R:%.*]], align 1
 ; UNALIGNED-NEXT:    ret void
 ;
-  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
-  store i16 1, i16 addrspace(5)* %r, align 1
-  store i16 2, i16 addrspace(5)* %gep.r, align 1
+  %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1
+  store i16 1, ptr addrspace(5) %r, align 1
+  store i16 2, ptr addrspace(5) %gep.r, align 1
   ret void
 }
 
-define i32 @private_load_2xi16_align2_not_alloca(i16 addrspace(5)* %p) #0 {
+define i32 @private_load_2xi16_align2_not_alloca(ptr addrspace(5) %p) #0 {
 ; CHECK-LABEL: @private_load_2xi16_align2_not_alloca(
-; CHECK-NEXT:    [[GEP_P:%.*]] = getelementptr i16, i16 addrspace(5)* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[P_0:%.*]] = load i16, i16 addrspace(5)* [[P]], align 2
-; CHECK-NEXT:    [[P_1:%.*]] = load i16, i16 addrspace(5)* [[GEP_P]], align 2
+; CHECK-NEXT:    [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
+; CHECK-NEXT:    [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2
+; CHECK-NEXT:    [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2
 ; CHECK-NEXT:    [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
 ; CHECK-NEXT:    [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
 ; CHECK-NEXT:    [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
 ; CHECK-NEXT:    ret i32 [[OR]]
 ;
-  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(5)* %p, align 2
-  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
+  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
+  %p.0 = load i16, ptr addrspace(5) %p, align 2
+  %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -348,11 +328,11 @@ define i32 @private_load_2xi16_align2_not_alloca(i16 addrspace(5)* %p) #0 {
   ret i32 %or
 }
 
-define i32 @private_load_2xi16_align1_not_alloca(i16 addrspace(5)* %p) #0 {
+define i32 @private_load_2xi16_align1_not_alloca(ptr addrspace(5) %p) #0 {
 ; ALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
-; ALIGNED-NEXT:    [[GEP_P:%.*]] = getelementptr i16, i16 addrspace(5)* [[P:%.*]], i64 1
-; ALIGNED-NEXT:    [[P_0:%.*]] = load i16, i16 addrspace(5)* [[P]], align 1
-; ALIGNED-NEXT:    [[P_1:%.*]] = load i16, i16 addrspace(5)* [[GEP_P]], align 1
+; ALIGNED-NEXT:    [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
+; ALIGNED-NEXT:    [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 1
+; ALIGNED-NEXT:    [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 1
 ; ALIGNED-NEXT:    [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
 ; ALIGNED-NEXT:    [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
 ; ALIGNED-NEXT:    [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
@@ -360,8 +340,7 @@ define i32 @private_load_2xi16_align1_not_alloca(i16 addrspace(5)* %p) #0 {
 ; ALIGNED-NEXT:    ret i32 [[OR]]
 ;
 ; UNALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
-; UNALIGNED-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[P:%.*]] to <2 x i16> addrspace(5)*
-; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i16>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[P:%.*]], align 1
 ; UNALIGNED-NEXT:    [[P_01:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
 ; UNALIGNED-NEXT:    [[P_12:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
 ; UNALIGNED-NEXT:    [[ZEXT_0:%.*]] = zext i16 [[P_01]] to i32
@@ -370,9 +349,9 @@ define i32 @private_load_2xi16_align1_not_alloca(i16 addrspace(5)* %p) #0 {
 ; UNALIGNED-NEXT:    [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
 ; UNALIGNED-NEXT:    ret i32 [[OR]]
 ;
-  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
-  %p.0 = load i16, i16 addrspace(5)* %p, align 1
-  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
+  %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
+  %p.0 = load i16, ptr addrspace(5) %p, align 1
+  %p.1 = load i16, ptr addrspace(5) %gep.p, align 1
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index aa9a6846a5e2..4c631e178cbc 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -17,42 +17,42 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 ; ELT8: store <2 x i32>
 
 ; ELT16-UNALIGNED: store <4 x i32>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
-  store i32 19, i32 addrspace(5)* %out.gep.3
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(5) %out, i32 3
+
+  store i32 9, ptr addrspace(5) %out
+  store i32 1, ptr addrspace(5) %out.gep.1
+  store i32 23, ptr addrspace(5) %out.gep.2
+  store i32 19, ptr addrspace(5) %out.gep.3
   ret void
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
-; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1
-; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+; ALIGNED: store i32 9, ptr addrspace(5) %out, align 1
+; ALIGNED: store i32 1, ptr addrspace(5) %out.gep.1, align 1
+; ALIGNED: store i32 23, ptr addrspace(5) %out.gep.2, align 1
+; ALIGNED: store i32 19, ptr addrspace(5) %out.gep.3, align 1
 
-; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, ptr addrspace(5) %out, align 1
 
-; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1
-; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, ptr addrspace(5) %out, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, ptr addrspace(5) %out.gep.2, align 1
 
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
 ; ELT4-UNALIGNED: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(5) %out, i32 3
+
+  store i32 9, ptr addrspace(5) %out, align 1
+  store i32 1, ptr addrspace(5) %out.gep.1, align 1
+  store i32 23, ptr addrspace(5) %out.gep.2, align 1
+  store i32 19, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
@@ -61,29 +61,29 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(
 ; ALL: store i32
 ; ALL: store i32
 ; ALL: store i32
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
-
-  store i32 9, i32 addrspace(5)* %out, align 2
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 2
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 2
-  store i32 19, i32 addrspace(5)* %out.gep.3, align 2
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(5) %out, i32 3
+
+  store i32 9, ptr addrspace(5) %out, align 2
+  store i32 1, ptr addrspace(5) %out.gep.1, align 2
+  store i32 23, ptr addrspace(5) %out.gep.2, align 2
+  store i32 19, ptr addrspace(5) %out.gep.3, align 2
   ret void
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
-
-  store i8 9, i8 addrspace(5)* %out, align 4
-  store i8 1, i8 addrspace(5)* %out.gep.1
-  store i8 23, i8 addrspace(5)* %out.gep.2
-  store i8 19, i8 addrspace(5)* %out.gep.3
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(5) %out, i32 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(5) %out, i32 3
+
+  store i8 9, ptr addrspace(5) %out, align 4
+  store i8 1, ptr addrspace(5) %out.gep.1
+  store i8 23, ptr addrspace(5) %out.gep.2
+  store i8 19, ptr addrspace(5) %out.gep.3
   ret void
 }
 
@@ -93,37 +93,37 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrs
 ; ALIGNED: store i8
 ; ALIGNED: store i8
 
-; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, ptr addrspace(5) %out, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(5) %out, i32 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(5) %out, i32 3
 
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
-  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  store i8 9, ptr addrspace(5) %out, align 1
+  store i8 1, ptr addrspace(5) %out.gep.1, align 1
+  store i8 23, ptr addrspace(5) %out.gep.2, align 1
+  store i8 19, ptr addrspace(5) %out.gep.3, align 1
   ret void
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1
 
-  store i16 9, i16 addrspace(5)* %out, align 4
-  store i16 12, i16 addrspace(5)* %out.gep.1
+  store i16 9, ptr addrspace(5) %out, align 4
+  store i16 12, ptr addrspace(5) %out.gep.1
   ret void
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
 ; ALL: store i16
 ; ALL: store i16
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1
 
-  store i16 9, i16 addrspace(5)* %out, align 2
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
+  store i16 9, ptr addrspace(5) %out, align 2
+  store i16 12, ptr addrspace(5) %out.gep.1, align 2
   ret void
 }
 
@@ -131,22 +131,22 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(
 ; ALIGNED: store i16
 ; ALIGNED: store i16
 
-; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, ptr addrspace(5) %out, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1
 
-  store i16 9, i16 addrspace(5)* %out, align 1
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 1
+  store i16 9, ptr addrspace(5) %out, align 1
+  store i16 12, ptr addrspace(5) %out.gep.1, align 1
   ret void
 }
 
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
-; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 8
-define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1
+; ALL: store <2 x i16> <i16 9, i16 12>, ptr addrspace(5) %out, align 8
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(5) %out, i32 1
 
-  store i16 9, i16 addrspace(5)* %out, align 8
-  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
+  store i16 9, ptr addrspace(5) %out, align 8
+  store i16 12, ptr addrspace(5) %out.gep.1, align 2
   ret void
 }
 
@@ -159,13 +159,13 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(
 ; ELT8: store i32
 
 ; ELT16: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
 
-  store i32 9, i32 addrspace(5)* %out
-  store i32 1, i32 addrspace(5)* %out.gep.1
-  store i32 23, i32 addrspace(5)* %out.gep.2
+  store i32 9, ptr addrspace(5) %out
+  store i32 1, ptr addrspace(5) %out.gep.1
+  store i32 23, ptr addrspace(5) %out.gep.2
   ret void
 }
 
@@ -182,13 +182,13 @@ define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 add
 ; ELT8-UNALIGNED: store i32
 
 ; ELT16-UNALIGNED: store <3 x i32>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(5) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(5) %out, i32 2
 
-  store i32 9, i32 addrspace(5)* %out, align 1
-  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
-  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 9, ptr addrspace(5) %out, align 1
+  store i32 1, ptr addrspace(5) %out.gep.1, align 1
+  store i32 23, ptr addrspace(5) %out.gep.2, align 1
   ret void
 }
 
@@ -198,13 +198,13 @@ define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(
 ; ALIGNED: store i8
 
 ; UNALIGNED: store <3 x i8>
-define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(ptr addrspace(5) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(5) %out, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(5) %out, i8 2
 
-  store i8 9, i8 addrspace(5)* %out, align 1
-  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
-  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 9, ptr addrspace(5) %out, align 1
+  store i8 1, ptr addrspace(5) %out.gep.1, align 1
+  store i8 23, ptr addrspace(5) %out.gep.2, align 1
   ret void
 }
 

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index 58964db6a1f8..38cb6c9bc3ed 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -9,643 +9,592 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 ; TODO: Same base addrspacecasted
 
 
-define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <2 x i8> addrspace(1)*
-; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* [[TMP1]], align 2
+; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
 
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out, align 2
+  store i8 123, ptr addrspace(1) %out.gep.1
+  store i8 456, ptr addrspace(1) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <2 x i8> addrspace(1)*
-; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
 
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out
+  store i8 123, ptr addrspace(1) %out.gep.1
+  store i8 456, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
-; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out, align 4
+  store i16 123, ptr addrspace(1) %out.gep.1
+  store i16 456, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
-; CHECK-NEXT:    store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i16> zeroinitializer, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 0, i16 addrspace(1)* %out.gep.1
-  store i16 0, i16 addrspace(1)* %out, align 4
+  store i16 0, ptr addrspace(1) %out.gep.1
+  store i16 0, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i16, i16 addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store i16 123, i16 addrspace(1)* [[OUT_GEP_1]], align 2
-; CHECK-NEXT:    store i16 456, i16 addrspace(1)* [[OUT]], align 2
+; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT:%.*]], i32 1
+; CHECK-NEXT:    store i16 123, ptr addrspace(1) [[OUT_GEP_1]], align 2
+; CHECK-NEXT:    store i16 456, ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out
+  store i16 123, ptr addrspace(1) %out.gep.1
+  store i16 456, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
-; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT:    store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 123, i16 addrspace(1)* %out.gep.1, align 1
-  store i16 456, i16 addrspace(1)* %out, align 1
+  store i16 123, ptr addrspace(1) %out.gep.1, align 1
+  store i16 456, ptr addrspace(1) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr half, half addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store half 0xH4000, half addrspace(1)* [[OUT_GEP_1]], align 2
-; CHECK-NEXT:    store half 0xH3C00, half addrspace(1)* [[OUT]], align 2
+; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr half, ptr addrspace(1) [[OUT:%.*]], i32 1
+; CHECK-NEXT:    store half 0xH4000, ptr addrspace(1) [[OUT_GEP_1]], align 2
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
 
-  store half 2.0, half addrspace(1)* %out.gep.1
-  store half 1.0, half addrspace(1)* %out
+  store half 2.0, ptr addrspace(1) %out.gep.1
+  store half 1.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_half_align_1(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half addrspace(1)* [[OUT:%.*]] to <2 x half> addrspace(1)*
-; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
 
-  store half 2.0, half addrspace(1)* %out.gep.1, align 1
-  store half 1.0, half addrspace(1)* %out, align 1
+  store half 2.0, ptr addrspace(1) %out.gep.1, align 1
+  store half 1.0, ptr addrspace(1) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    [[OUT_GEP_1_BC:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_1]] to float addrspace(1)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 456, i32 1065353216>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
-  store float 1.0, float addrspace(1)* %out.gep.1.bc
-  store i32 456, i32 addrspace(1)* %out
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    [[OUT_GEP_1_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_1]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 1082130432, i32 123>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  store i32 123, i32 addrspace(1)* %out.gep.1.bc
-  store float 4.0, float addrspace(1)* %out
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store float 4.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 333, i32 addrspace(1)* %out.gep.3
-  store i32 1234, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out.gep.2
+  store i32 333, ptr addrspace(1) %out.gep.3
+  store i32 1234, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
-; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  store float 8.0, float addrspace(1)* %out
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store float 4.0, ptr addrspace(1) %out.gep.3
   ret void
 }
 
 ; First store is out of order.
-define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
-; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  store float 8.0, float addrspace(1)* %out
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store float 4.0, ptr addrspace(1) %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    [[OUT_GEP_3:%.*]] = getelementptr float, float addrspace(1)* [[OUT]], i32 3
-; CHECK-NEXT:    [[OUT_GEP_1_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_1]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[OUT_GEP_3_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_3]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 
-  store i32 11, i32 addrspace(1)* %out.gep.1.bc
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store i32 17, i32 addrspace(1)* %out.gep.3.bc
-  store float 8.0, float addrspace(1)* %out
+  store i32 11, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store i32 17, ptr addrspace(1) %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_3_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <3 x i32> addrspace(1)*
-; CHECK-NEXT:    store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT:    store <3 x i32> <i32 1234, i32 123, i32 456>, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 1234, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out.gep.2
+  store i32 1234, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_2_constants_i64(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 addrspace(1)* [[OUT:%.*]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT:    store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* [[TMP1]], align 8
+; CHECK-NEXT:    store <2 x i64> <i64 456, i64 123>, ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
 
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out
+  store i64 123, ptr addrspace(1) %out.gep.1
+  store i64 456, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_i64(
-; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i64, i64 addrspace(1)* [[OUT:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 addrspace(1)* [[OUT_GEP_2]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT:    store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64 addrspace(1)* [[OUT]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT:    store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* [[TMP2]], align 8
+; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i64, ptr addrspace(1) [[OUT:%.*]], i64 2
+; CHECK-NEXT:    store <2 x i64> <i64 456, i64 333>, ptr addrspace(1) [[OUT_GEP_2]], align 8
+; CHECK-NEXT:    store <2 x i64> <i64 1234, i64 123>, ptr addrspace(1) [[OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
-  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
-  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
+  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
+  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
+  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3
 
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out.gep.2
-  store i64 333, i64 addrspace(1)* %out.gep.3
-  store i64 1234, i64 addrspace(1)* %out
+  store i64 123, ptr addrspace(1) %out.gep.1
+  store i64 456, ptr addrspace(1) %out.gep.2
+  store i64 333, ptr addrspace(1) %out.gep.3
+  store i64 1234, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[HI2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
 
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %lo = load i32, ptr addrspace(1) %in
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %lo, i32 addrspace(1)* %out
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  store i32 %lo, ptr addrspace(1) %out
+  store i32 %hi, ptr addrspace(1) %out.gep.1
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base(
-; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 2
-; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN_GEP_0]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[HI2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_0]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 2
+; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
+; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[HI2]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT_GEP_0]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3
 
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %lo = load i32, i32 addrspace(1)* %in.gep.0
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %lo = load i32, ptr addrspace(1) %in.gep.0
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %lo, i32 addrspace(1)* %out.gep.0
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  store i32 %lo, ptr addrspace(1) %out.gep.0
+  store i32 %hi, ptr addrspace(1) %out.gep.1
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LO1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[LO1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[HI2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LO1]], i32 1
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
 
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %lo = load i32, ptr addrspace(1) %in
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %hi, i32 addrspace(1)* %out
-  store i32 %lo, i32 addrspace(1)* %out.gep.1
+  store i32 %hi, ptr addrspace(1) %out
+  store i32 %lo, ptr addrspace(1) %out.gep.1
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
+
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
+
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <3 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <3 x i32>, <3 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <3 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <3 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <3 x i32> [[TMP4]], i32 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <3 x i32> addrspace(1)*
-; CHECK-NEXT:    store <3 x i32> [[TMP5]], <3 x i32> addrspace(1)* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <3 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <3 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Z3]], i32 2
+; CHECK-NEXT:    store <3 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
 
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
 
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to <4 x float> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
-; CHECK-NEXT:    store <4 x float> [[TMP6]], <4 x float> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
-
-  %x = load float, float addrspace(1)* %in
-  %y = load float, float addrspace(1)* %in.gep.1
-  %z = load float, float addrspace(1)* %in.gep.2
-  %w = load float, float addrspace(1)* %in.gep.3
-
-  store float %x, float addrspace(1)* %out
-  store float %y, float addrspace(1)* %out.gep.1
-  store float %z, float addrspace(1)* %out.gep.2
-  store float %w, float addrspace(1)* %out.gep.3
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[W4]], i32 3
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3
+
+  %x = load float, ptr addrspace(1) %in
+  %y = load float, ptr addrspace(1) %in.gep.1
+  %z = load float, ptr addrspace(1) %in.gep.2
+  %w = load float, ptr addrspace(1) %in.gep.3
+
+  store float %x, ptr addrspace(1) %out
+  store float %y, ptr addrspace(1) %out.gep.1
+  store float %z, ptr addrspace(1) %out.gep.2
+  store float %w, ptr addrspace(1) %out.gep.3
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base(
-; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 11
-; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN_GEP_0]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_0]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
-
-  %x = load i32, i32 addrspace(1)* %in.gep.0
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
-
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-NEXT:    [[IN_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 11
+; CHECK-NEXT:    [[OUT_GEP_0:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 7
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN_GEP_0]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT_GEP_0]], align 4
+; CHECK-NEXT:    ret void
+;
+  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10
+
+  %x = load i32, ptr addrspace(1) %in.gep.0
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
+
+  store i32 %x, ptr addrspace(1) %out.gep.0
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier() #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[W4]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
+
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
   ; Make sure the barrier doesn't stop this
   tail call void @llvm.amdgcn.s.barrier() #1
 
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %w, ptr addrspace(1) %out.gep.3
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %x, ptr addrspace(1) %out
 
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier() #[[ATTR3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Y2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[X1]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[X1]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
+
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
   ; Make sure the barrier doesn't stop this
   tail call void @llvm.amdgcn.s.barrier() #1
 
-  store i32 %w, i32 addrspace(1)* %out
-  store i32 %z, i32 addrspace(1)* %out.gep.1
-  store i32 %y, i32 addrspace(1)* %out.gep.2
-  store i32 %x, i32 addrspace(1)* %out.gep.3
+  store i32 %w, ptr addrspace(1) %out
+  store i32 %z, ptr addrspace(1) %out.gep.1
+  store i32 %y, ptr addrspace(1) %out.gep.2
+  store i32 %x, ptr addrspace(1) %out.gep.3
 
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[IN:%.*]] to <4 x i8> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <4 x i8> addrspace(1)*
-; CHECK-NEXT:    store <4 x i8> [[TMP6]], <4 x i8> addrspace(1)* [[TMP7]], align 4
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
-
-  %x = load i8, i8 addrspace(1)* %in, align 4
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
-
-  store i8 %x, i8 addrspace(1)* %out, align 4
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
+  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
+  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
+
+  %x = load i8, ptr addrspace(1) %in, align 4
+  %y = load i8, ptr addrspace(1) %in.gep.1
+  %z = load i8, ptr addrspace(1) %in.gep.2
+  %w = load i8, ptr addrspace(1) %in.gep.3
+
+  store i8 %x, ptr addrspace(1) %out, align 4
+  store i8 %y, ptr addrspace(1) %out.gep.1
+  store i8 %z, ptr addrspace(1) %out.gep.2
+  store i8 %w, ptr addrspace(1) %out.gep.3
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[IN:%.*]] to <4 x i8> addrspace(1)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(1)* [[TMP1]], align 1
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
-; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
-; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Y2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Z3]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[W4]], i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <4 x i8> addrspace(1)*
-; CHECK-NEXT:    store <4 x i8> [[TMP6]], <4 x i8> addrspace(1)* [[TMP7]], align 1
-; CHECK-NEXT:    ret void
-;
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
-
-  %x = load i8, i8 addrspace(1)* %in
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
-
-  store i8 %x, i8 addrspace(1)* %out
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(1) [[IN:%.*]], align 1
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[Z3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[W4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[Y2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Z3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[W4]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP5]], ptr addrspace(1) [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
+  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
+  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
+
+  %x = load i8, ptr addrspace(1) %in
+  %y = load i8, ptr addrspace(1) %in.gep.1
+  %z = load i8, ptr addrspace(1) %in.gep.2
+  %w = load i8, ptr addrspace(1) %in.gep.3
+
+  store i8 %x, ptr addrspace(1) %out
+  store i8 %y, ptr addrspace(1) %out.gep.1
+  store i8 %z, ptr addrspace(1) %out.gep.2
+  store i8 %w, ptr addrspace(1) %out.gep.3
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32(
-; CHECK-NEXT:    [[VEC:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC:%.*]] = load <4 x i32>, ptr addrspace(1) [[IN:%.*]], align 16
 ; CHECK-NEXT:    [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
 ; CHECK-NEXT:    [[Y:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
 ; CHECK-NEXT:    [[Z:%.*]] = extractelement <4 x i32> [[VEC]], i32 2
@@ -654,249 +603,237 @@ define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addr
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Y]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[W]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %vec = load <4 x i32>, ptr addrspace(1) %in
 
   %x = extractelement <4 x i32> %vec, i32 0
   %y = extractelement <4 x i32> %vec, i32 1
   %z = extractelement <4 x i32> %vec, i32 2
   %w = extractelement <4 x i32> %vec, i32 3
 
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
-define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
 ; CHECK-LABEL: @merge_local_store_2_constants_i8(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[OUT:%.*]] to <2 x i8> addrspace(3)*
-; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* [[TMP1]], align 2
+; CHECK-NEXT:    store <2 x i8> <i8 -56, i8 123>, ptr addrspace(3) [[OUT:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1
 
-  store i8 123, i8 addrspace(3)* %out.gep.1
-  store i8 456, i8 addrspace(3)* %out, align 2
+  store i8 123, ptr addrspace(3) %out.gep.1
+  store i8 456, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
 ; CHECK-LABEL: @merge_local_store_2_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(3)* [[OUT:%.*]] to <2 x i32> addrspace(3)*
-; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* [[TMP1]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 456, i32 123>, ptr addrspace(3) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
 
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out
+  store i32 123, ptr addrspace(3) %out.gep.1
+  store i32 456, ptr addrspace(3) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(ptr addrspace(3) %out) #0 {
 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2(
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i32, i32 addrspace(3)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store i32 123, i32 addrspace(3)* [[OUT_GEP_1]], align 2
-; CHECK-NEXT:    store i32 456, i32 addrspace(3)* [[OUT]], align 2
+; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 1
+; CHECK-NEXT:    store i32 123, ptr addrspace(3) [[OUT_GEP_1]], align 2
+; CHECK-NEXT:    store i32 456, ptr addrspace(3) [[OUT]], align 2
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
 
-  store i32 123, i32 addrspace(3)* %out.gep.1, align 2
-  store i32 456, i32 addrspace(3)* %out, align 2
+  store i32 123, ptr addrspace(3) %out.gep.1, align 2
+  store i32 456, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
 ; CHECK-LABEL: @merge_local_store_4_constants_i32(
-; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i32, i32 addrspace(3)* [[OUT:%.*]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(3)* [[OUT_GEP_2]] to <2 x i32> addrspace(3)*
-; CHECK-NEXT:    store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(3)* [[OUT]] to <2 x i32> addrspace(3)*
-; CHECK-NEXT:    store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* [[TMP2]], align 4
+; CHECK-NEXT:    [[OUT_GEP_2:%.*]] = getelementptr i32, ptr addrspace(3) [[OUT:%.*]], i32 2
+; CHECK-NEXT:    store <2 x i32> <i32 456, i32 333>, ptr addrspace(3) [[OUT_GEP_2]], align 4
+; CHECK-NEXT:    store <2 x i32> <i32 1234, i32 123>, ptr addrspace(3) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
+  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3
 
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out.gep.2
-  store i32 333, i32 addrspace(3)* %out.gep.3
-  store i32 1234, i32 addrspace(3)* %out
+  store i32 123, ptr addrspace(3) %out.gep.1
+  store i32 456, ptr addrspace(3) %out.gep.2
+  store i32 333, ptr addrspace(3) %out.gep.3
+  store i32 1234, ptr addrspace(3) %out
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
 ; CHECK-LABEL: @merge_global_store_5_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
-; CHECK-NEXT:    store i32 11, i32 addrspace(1)* [[IDX4]], align 4
+; CHECK-NEXT:    store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NEXT:    store i32 11, ptr addrspace(1) [[IDX4]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 9, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 12, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 16, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 -12, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
+  store i32 9, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 12, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 16, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 -12, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 11, ptr addrspace(1) %idx4, align 4
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
 ; CHECK-LABEL: @merge_global_store_6_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <2 x i32> addrspace(1)*
-; CHECK-NEXT:    store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* [[TMP2]], align 4
+; CHECK-NEXT:    store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NEXT:    store <2 x i32> <i32 11, i32 123>, ptr addrspace(1) [[IDX4]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 13, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 15, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 62, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 63, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 123, i32 addrspace(1)* %idx5, align 4
+  store i32 13, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 15, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 62, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 63, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 11, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 123, ptr addrspace(1) %idx5, align 4
   ret void
 }
 
-define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
 ; CHECK-LABEL: @merge_global_store_7_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <3 x i32> addrspace(1)*
-; CHECK-NEXT:    store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
-  ret void
-}
-
-define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NEXT:    store <3 x i32> <i32 98, i32 91, i32 212>, ptr addrspace(1) [[IDX4]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 34, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 999, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 65, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 33, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 98, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 91, ptr addrspace(1) %idx5, align 4
+  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
+  store i32 212, ptr addrspace(1) %idx6, align 4
+  ret void
+}
+
+define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
 ; CHECK-LABEL: @merge_global_store_8_constants_i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* [[TMP1]], align 4
-; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT:    store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
-  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
-  store i32 999, i32 addrspace(1)* %idx7, align 4
-  ret void
-}
-
-define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+; CHECK-NEXT:    store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], i64 4
+; CHECK-NEXT:    store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, ptr addrspace(1) [[IDX4]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 34, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 999, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 65, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 33, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 98, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 91, ptr addrspace(1) %idx5, align 4
+  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
+  store i32 212, ptr addrspace(1) %idx6, align 4
+  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
+  store i32 999, ptr addrspace(1) %idx7, align 4
+  ret void
+}
+
+define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
 ; CHECK-LABEL: @copy_v3i32_align4(
-; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i32>, <3 x i32> addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store <3 x i32> [[VEC]], <3 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i32>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store <3 x i32> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
+  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
+  store <3 x i32> %vec, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
 ; CHECK-LABEL: @copy_v3i64_align4(
-; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i64>, <3 x i64> addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store <3 x i64> [[VEC]], <3 x i64> addrspace(1)* [[OUT:%.*]], align 32
+; CHECK-NEXT:    [[VEC:%.*]] = load <3 x i64>, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store <3 x i64> [[VEC]], ptr addrspace(1) [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
-  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
+  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
+  store <3 x i64> %vec, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
 ; CHECK-LABEL: @copy_v3f32_align4(
-; CHECK-NEXT:    [[VEC:%.*]] = load <3 x float>, <3 x float> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC:%.*]] = load <3 x float>, ptr addrspace(1) [[IN:%.*]], align 4
 ; CHECK-NEXT:    [[FADD:%.*]] = fadd <3 x float> [[VEC]], <float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
-; CHECK-NEXT:    store <3 x float> [[FADD]], <3 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT:    store <3 x float> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
+  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
-  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
+  store <3 x float> %fadd, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
 ; CHECK-LABEL: @copy_v3f64_align4(
-; CHECK-NEXT:    [[VEC:%.*]] = load <3 x double>, <3 x double> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC:%.*]] = load <3 x double>, ptr addrspace(1) [[IN:%.*]], align 4
 ; CHECK-NEXT:    [[FADD:%.*]] = fadd <3 x double> [[VEC]], <double 1.000000e+00, double 2.000000e+00, double 4.000000e+00>
-; CHECK-NEXT:    store <3 x double> [[FADD]], <3 x double> addrspace(1)* [[OUT:%.*]], align 32
+; CHECK-NEXT:    store <3 x double> [[FADD]], ptr addrspace(1) [[OUT:%.*]], align 32
 ; CHECK-NEXT:    ret void
 ;
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
+  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
-  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
+  store <3 x double> %fadd, ptr addrspace(1) %out
   ret void
 }
 
 ; Verify that we no longer hit asserts for this test case. No change expected.
-define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
+define amdgpu_kernel void @copy_vec_of_ptrs(ptr addrspace(1) %out,
 ; CHECK-LABEL: @copy_vec_of_ptrs(
-; CHECK-NEXT:    [[IN_GEP_1:%.*]] = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* [[IN:%.*]], i32 1
-; CHECK-NEXT:    [[VEC1:%.*]] = load <2 x i16*>, <2 x i16*> addrspace(1)* [[IN_GEP_1]], align 16
-; CHECK-NEXT:    [[VEC2:%.*]] = load <2 x i16*>, <2 x i16*> addrspace(1)* [[IN]], align 4
-; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* [[OUT:%.*]], i32 1
-; CHECK-NEXT:    store <2 x i16*> [[VEC1]], <2 x i16*> addrspace(1)* [[OUT_GEP_1]], align 16
-; CHECK-NEXT:    store <2 x i16*> [[VEC2]], <2 x i16*> addrspace(1)* [[OUT]], align 4
-; CHECK-NEXT:    ret void
-;
-  <2 x i16*> addrspace(1)* %in ) #0 {
-  %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
-  %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
-  %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
-
-  %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
-  store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
-  store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
+; CHECK-NEXT:    [[IN_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[IN:%.*]], i32 1
+; CHECK-NEXT:    [[VEC1:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN_GEP_1]], align 16
+; CHECK-NEXT:    [[VEC2:%.*]] = load <2 x ptr>, ptr addrspace(1) [[IN]], align 4
+; CHECK-NEXT:    [[OUT_GEP_1:%.*]] = getelementptr <2 x ptr>, ptr addrspace(1) [[OUT:%.*]], i32 1
+; CHECK-NEXT:    store <2 x ptr> [[VEC1]], ptr addrspace(1) [[OUT_GEP_1]], align 16
+; CHECK-NEXT:    store <2 x ptr> [[VEC2]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) %in ) #0 {
+  %in.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %in, i32 1
+  %vec1 = load <2 x ptr>, ptr addrspace(1) %in.gep.1
+  %vec2 = load <2 x ptr>, ptr addrspace(1) %in, align 4
+
+  %out.gep.1 = getelementptr <2 x ptr>, ptr addrspace(1) %out, i32 1
+  store <2 x ptr> %vec1, ptr addrspace(1) %out.gep.1
+  store <2 x ptr> %vec2, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
index b7863b383134..7bfd6d59b270 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -8,37 +8,35 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 ; The original load has an implicit alignment of 4, and should not
 ; increase to an align 8 load.
 
-define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
+define amdgpu_kernel void @load_keep_base_alignment_missing_align(ptr addrspace(1) %out) {
 ; CHECK-LABEL: @load_keep_base_alignment_missing_align(
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(3)* [[PTR0]] to <2 x float> addrspace(3)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(3)* [[TMP1]], align 4
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 11
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(3) [[PTR0]], align 4
 ; CHECK-NEXT:    [[VAL01:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[VAL12:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[VAL01]], [[VAL12]]
-; CHECK-NEXT:    store float [[ADD]], float addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store float [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
-  %val0 = load float, float addrspace(3)* %ptr0
+  %ptr0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 11
+  %val0 = load float, ptr addrspace(3) %ptr0
 
-  %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
-  %val1 = load float, float addrspace(3)* %ptr1
+  %ptr1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 12
+  %val1 = load float, ptr addrspace(3) %ptr1
   %add = fadd float %val0, %val1
-  store float %add, float addrspace(1)* %out
+  store float %add, ptr addrspace(1) %out
   ret void
 }
 
 define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
 ; CHECK-LABEL: @store_keep_base_alignment_missing_align(
-; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float addrspace(3)* [[ARRAYIDX0]] to <2 x float> addrspace(3)*
-; CHECK-NEXT:    store <2 x float> zeroinitializer, <2 x float> addrspace(3)* [[TMP1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 1
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(3) [[ARRAYIDX0]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
-  store float 0.0, float addrspace(3)* %arrayidx0
-  store float 0.0, float addrspace(3)* %arrayidx1
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 1
+  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 2
+  store float 0.0, ptr addrspace(3) %arrayidx0
+  store float 0.0, ptr addrspace(3) %arrayidx1
   ret void
 }

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
index ecaaccfebc04..4da4ee49f29f 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/propagate-invariance-metadata.ll
@@ -4,21 +4,20 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-define i32 @foo(i32* %ptr) {
+define i32 @foo(ptr %ptr) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 1
-; CHECK-NEXT:    [[P1:%.*]] = addrspacecast i32* [[PTR1]] to i32 addrspace(1)*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[PTR]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 8, !invariant.load !0
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i32 1
+; CHECK-NEXT:    [[P1:%.*]] = addrspacecast ptr [[PTR1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[PTR]], align 8, !invariant.load !0
 ; CHECK-NEXT:    [[V01:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[V12:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[SUM:%.*]] = add i32 [[V01]], [[V12]]
 ; CHECK-NEXT:    ret i32 [[SUM]]
 ;
-  %ptr1 = getelementptr i32, i32* %ptr, i32 1
-  %p1 = addrspacecast i32* %ptr1 to i32 addrspace(1)*
-  %v0 = load i32, i32* %ptr, align 8, !invariant.load !0
-  %v1 = load i32, i32* %ptr1, align 4, !invariant.load !0
+  %ptr1 = getelementptr i32, ptr %ptr, i32 1
+  %p1 = addrspacecast ptr %ptr1 to ptr addrspace(1)
+  %v0 = load i32, ptr %ptr, align 8, !invariant.load !0
+  %v1 = load i32, ptr %ptr1, align 4, !invariant.load !0
   %sum = add i32 %v0, %v1
   ret i32 %sum
 }

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
index bff0b95a7a76..3dea7f3520c0 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -4,30 +4,26 @@
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
-define void @correct_order(i32* noalias %ptr) {
+define void @correct_order(ptr noalias %ptr) {
 ; CHECK-LABEL: @correct_order(
-; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i32, i32* [[PTR]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP1]] to <2 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[NEXT_GEP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[NEXT_GEP1]], align 4
 ; CHECK-NEXT:    [[L11:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[L42:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-; CHECK-NEXT:    [[L2:%.*]] = load i32, i32* [[NEXT_GEP]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[NEXT_GEP]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[NEXT_GEP1]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[L3:%.*]] = load i32, ptr [[NEXT_GEP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %next.gep = getelementptr i32, i32* %ptr, i64 0
-  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
-  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep1 = getelementptr i32, ptr %ptr, i64 1
+  %next.gep2 = getelementptr i32, ptr %ptr, i64 2
 
-  %l1 = load i32, i32* %next.gep1, align 4
-  %l2 = load i32, i32* %next.gep, align 4
-  store i32 0, i32* %next.gep1, align 4
-  store i32 0, i32* %next.gep, align 4
-  %l3 = load i32, i32* %next.gep1, align 4
-  %l4 = load i32, i32* %next.gep2, align 4
+  %l1 = load i32, ptr %next.gep1, align 4
+  %l2 = load i32, ptr %ptr, align 4
+  store i32 0, ptr %next.gep1, align 4
+  store i32 0, ptr %ptr, align 4
+  %l3 = load i32, ptr %next.gep1, align 4
+  %l4 = load i32, ptr %next.gep2, align 4
 
   ret void
 }

diff  --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
index 6c340efa85a0..678a10855ae0 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/non-byte-size.ll
@@ -6,26 +6,22 @@
 
 ; We currently do not optimize this scenario.
 ; But we verify that we no longer crash when compiling this.
-define void @test1(%rec* %out, %rec* %in) {
+define void @test1(ptr %out, ptr %in) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[IN1:%.*]] = getelementptr [[REC:%.*]], %rec* [[IN:%.*]], i16 0, i32 0
-; CHECK-NEXT:    [[IN2:%.*]] = getelementptr [[REC]], %rec* [[IN]], i16 0, i32 1
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32* [[IN1]], align 8
-; CHECK-NEXT:    [[VAL2:%.*]] = load i28, i28* [[IN2]], align 4
-; CHECK-NEXT:    [[OUT1:%.*]] = getelementptr [[REC]], %rec* [[OUT:%.*]], i16 0, i32 0
-; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr [[REC]], %rec* [[OUT]], i16 0, i32 1
-; CHECK-NEXT:    store i32 [[VAL1]], i32* [[OUT1]], align 8
-; CHECK-NEXT:    store i28 [[VAL2]], i28* [[OUT2]], align 4
+; CHECK-NEXT:    [[IN2:%.*]] = getelementptr [[REC:%.*]], ptr [[IN:%.*]], i16 0, i32 1
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[IN]], align 8
+; CHECK-NEXT:    [[VAL2:%.*]] = load i28, ptr [[IN2]], align 4
+; CHECK-NEXT:    [[OUT2:%.*]] = getelementptr [[REC]], ptr [[OUT:%.*]], i16 0, i32 1
+; CHECK-NEXT:    store i32 [[VAL1]], ptr [[OUT]], align 8
+; CHECK-NEXT:    store i28 [[VAL2]], ptr [[OUT2]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  %in1 = getelementptr %rec, %rec* %in, i16 0, i32 0
-  %in2 = getelementptr %rec, %rec* %in, i16 0, i32 1
-  %val1 = load i32, i32* %in1, align 8
-  %val2 = load i28, i28* %in2
-  %out1 = getelementptr %rec, %rec* %out, i16 0, i32 0
-  %out2 = getelementptr %rec, %rec* %out, i16 0, i32 1
-  store i32 %val1, i32* %out1, align 8
-  store i28 %val2, i28* %out2
+  %in2 = getelementptr %rec, ptr %in, i16 0, i32 1
+  %val1 = load i32, ptr %in, align 8
+  %val2 = load i28, ptr %in2
+  %out2 = getelementptr %rec, ptr %out, i16 0, i32 1
+  store i32 %val1, ptr %out, align 8
+  store i28 %val2, ptr %out2
   ret void
 }
 
