[llvm] d85e849 - AMDGPU: Convert some assorted tests to opaque pointers

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 1 18:40:39 PST 2022


Author: Matt Arsenault
Date: 2022-12-01T21:40:30-05:00
New Revision: d85e849ff4d5e03ed83ee10e56074f238ba444d3

URL: https://github.com/llvm/llvm-project/commit/d85e849ff4d5e03ed83ee10e56074f238ba444d3
DIFF: https://github.com/llvm/llvm-project/commit/d85e849ff4d5e03ed83ee10e56074f238ba444d3.diff

LOG: AMDGPU: Convert some assorted tests to opaque pointers
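
For context, the conversion is mechanical: typed pointee types in pointer
operands become the opaque "ptr" type (keeping the address space), and
bitcasts between pointer types are dropped since they become no-ops. A
minimal before/after sketch of the pattern (illustrative only; the function
@example below is hypothetical, not one of the tests in this commit):

    ; typed pointers (old form)
    define void @example(i32 addrspace(1)* %p) {
      %cast = bitcast i32 addrspace(1)* %p to float addrspace(1)*
      %v = load float, float addrspace(1)* %cast
      store float %v, float addrspace(1)* %cast
      ret void
    }

    ; opaque pointers (new form)
    define void @example(ptr addrspace(1) %p) {
      %v = load float, ptr addrspace(1) %p
      store float %v, ptr addrspace(1) %p
      ret void
    }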

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/aa-points-to-constant-memory.ll
    llvm/test/CodeGen/AMDGPU/acc-ldst.ll
    llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll
    llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
    llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
    llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
    llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
    llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
    llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
    llvm/test/CodeGen/AMDGPU/bfe-combine.ll
    llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
    llvm/test/CodeGen/AMDGPU/cc-update.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
    llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
    llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
    llvm/test/CodeGen/AMDGPU/cluster_stores.ll
    llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
    llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
    llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
    llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
    llvm/test/CodeGen/AMDGPU/combine_vloads.ll
    llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
    llvm/test/CodeGen/AMDGPU/ctpop.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/ctpop64.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
    llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
    llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
    llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll
    llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
    llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
    llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
    llvm/test/CodeGen/AMDGPU/dpp_combine.ll
    llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll
    llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
    llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
    llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
    llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
    llvm/test/CodeGen/AMDGPU/early-if-convert.ll
    llvm/test/CodeGen/AMDGPU/early-inline-alias.ll
    llvm/test/CodeGen/AMDGPU/early-inline.ll
    llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
    llvm/test/CodeGen/AMDGPU/extra-sroa-after-unroll.ll
    llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
    llvm/test/CodeGen/AMDGPU/fceil.ll
    llvm/test/CodeGen/AMDGPU/fceil64.ll
    llvm/test/CodeGen/AMDGPU/fence-barrier.ll
    llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
    llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
    llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
    llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/function-call-relocs.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/gds-allocation.ll
    llvm/test/CodeGen/AMDGPU/gds-atomic.ll
    llvm/test/CodeGen/AMDGPU/gep-address-space.ll
    llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
    llvm/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
    llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
    llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
    llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
    llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
    llvm/test/CodeGen/AMDGPU/hoist-cond.ll
    llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
    llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
    llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
    llvm/test/CodeGen/AMDGPU/hsa-func.ll
    llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
    llvm/test/CodeGen/AMDGPU/internalize.ll
    llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
    llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
    llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
    llvm/test/CodeGen/AMDGPU/ipra.ll
    llvm/test/CodeGen/AMDGPU/jump-address.ll
    llvm/test/CodeGen/AMDGPU/kernarg-size.ll
    llvm/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
    llvm/test/CodeGen/AMDGPU/knownbits-recursion.ll
    llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
    llvm/test/CodeGen/AMDGPU/lcssa-optnone.ll
    llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
    llvm/test/CodeGen/AMDGPU/mad-combine.ll
    llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
    llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
    llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
    llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
    llvm/test/CodeGen/AMDGPU/merge-stores.ll
    llvm/test/CodeGen/AMDGPU/mesa3d.ll
    llvm/test/CodeGen/AMDGPU/missing-store.ll
    llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
    llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
    llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
    llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
    llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
    llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
    llvm/test/CodeGen/AMDGPU/mubuf.ll
    llvm/test/CodeGen/AMDGPU/nand.ll
    llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
    llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
    llvm/test/CodeGen/AMDGPU/nop-data.ll
    llvm/test/CodeGen/AMDGPU/nullptr.ll
    llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
    llvm/test/CodeGen/AMDGPU/omod.ll
    llvm/test/CodeGen/AMDGPU/operand-spacing.ll
    llvm/test/CodeGen/AMDGPU/optimize-compare.ll
    llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
    llvm/test/CodeGen/AMDGPU/predicates.ll
    llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
    llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
    llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
    llvm/test/CodeGen/AMDGPU/recursion.ll
    llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
    llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
    llvm/test/CodeGen/AMDGPU/reorder-stores.ll
    llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
    llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
    llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
    llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
    llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
    llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
    llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
    llvm/test/CodeGen/AMDGPU/sad.ll
    llvm/test/CodeGen/AMDGPU/saddo.ll
    llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
    llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
    llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
    llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
    llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
    llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
    llvm/test/CodeGen/AMDGPU/simplify-libcalls2.ll
    llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
    llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
    llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
    llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
    llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
    llvm/test/CodeGen/AMDGPU/ssubo.ll
    llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
    llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
    llvm/test/CodeGen/AMDGPU/swdev282079.ll
    llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll
    llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
    llvm/test/CodeGen/AMDGPU/target-mem-intrinsic-metadata.ll
    llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
    llvm/test/CodeGen/AMDGPU/trap-abis.ll
    llvm/test/CodeGen/AMDGPU/trap.ll
    llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
    llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
    llvm/test/CodeGen/AMDGPU/uaddo.ll
    llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
    llvm/test/CodeGen/AMDGPU/unknown-processor.ll
    llvm/test/CodeGen/AMDGPU/unroll.ll
    llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
    llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
    llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
    llvm/test/CodeGen/AMDGPU/usubo.ll
    llvm/test/CodeGen/AMDGPU/v1024.ll
    llvm/test/CodeGen/AMDGPU/v_cndmask.ll
    llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
    llvm/test/CodeGen/AMDGPU/valu-i1.ll
    llvm/test/CodeGen/AMDGPU/vop-shrink.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll
    llvm/test/CodeGen/AMDGPU/wqm.ll
    llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/aa-points-to-constant-memory.ll b/llvm/test/CodeGen/AMDGPU/aa-points-to-constant-memory.ll
index d240b0819e8b..4e945951dab6 100644
--- a/llvm/test/CodeGen/AMDGPU/aa-points-to-constant-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/aa-points-to-constant-memory.ll
@@ -5,89 +5,89 @@
 ; turn out to be stores to constant memory, and will therefore be
 ; deleted as UB.
 
-define void @test_constant_addrspace(i8 addrspace(4)* %p) {
+define void @test_constant_addrspace(ptr addrspace(4) %p) {
 ; CHECK-LABEL: @test_constant_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  store i8 0, i8 addrspace(4)* %p
+  store i8 0, ptr addrspace(4) %p
   ret void
 }
 
-define void @test_constant32bit_addrspace(i8 addrspace(6)* %p) {
+define void @test_constant32bit_addrspace(ptr addrspace(6) %p) {
 ; CHECK-LABEL: @test_constant32bit_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  store i8 0, i8 addrspace(6)* %p
+  store i8 0, ptr addrspace(6) %p
   ret void
 }
 
-define void @test_cast_generic_from_constant_addrspace(i8 addrspace(4)* %p) {
+define void @test_cast_generic_from_constant_addrspace(ptr addrspace(4) %p) {
 ; CHECK-LABEL: @test_cast_generic_from_constant_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  %cast = addrspacecast i8 addrspace(4)* %p to i8*
-  store i8 0, i8* %cast
+  %cast = addrspacecast ptr addrspace(4) %p to ptr
+  store i8 0, ptr %cast
   ret void
 }
 
-define void @test_cast_generic_from_constant32bit_addrspace(i8 addrspace(6)* %p) {
+define void @test_cast_generic_from_constant32bit_addrspace(ptr addrspace(6) %p) {
 ; CHECK-LABEL: @test_cast_generic_from_constant32bit_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  %cast = addrspacecast i8 addrspace(6)* %p to i8*
-  store i8 0, i8* %cast
+  %cast = addrspacecast ptr addrspace(6) %p to ptr
+  store i8 0, ptr %cast
   ret void
 }
 
-define void @test_cast_generic_to_constant_addrspace(i8* %p) {
+define void @test_cast_generic_to_constant_addrspace(ptr %p) {
 ; CHECK-LABEL: @test_cast_generic_to_constant_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  %cast = addrspacecast i8* %p to i8 addrspace(4)*
-  store i8 0, i8 addrspace(4)* %cast
+  %cast = addrspacecast ptr %p to ptr addrspace(4)
+  store i8 0, ptr addrspace(4) %cast
   ret void
 }
 
-define void @test_cast_generic_to_constant32bit_addrspace(i8* %p) {
+define void @test_cast_generic_to_constant32bit_addrspace(ptr %p) {
 ; CHECK-LABEL: @test_cast_generic_to_constant32bit_addrspace(
 ; CHECK-NEXT:    ret void
 ;
-  %cast = addrspacecast i8* %p to i8 addrspace(6)*
-  store i8 0, i8 addrspace(6)* %cast
+  %cast = addrspacecast ptr %p to ptr addrspace(6)
+  store i8 0, ptr addrspace(6) %cast
   ret void
 }
 
-define amdgpu_kernel void @noalias_readnone_global_kernarg(i32 addrspace(1)* noalias readnone %arg) {
+define amdgpu_kernel void @noalias_readnone_global_kernarg(ptr addrspace(1) noalias readnone %arg) {
 ; CHECK-LABEL: @noalias_readnone_global_kernarg(
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(1)* %arg
+  store i32 0, ptr addrspace(1) %arg
   ret void
 }
 
-define amdgpu_kernel void @noalias_readonly_global_kernarg(i32 addrspace(1)* noalias readonly %arg) {
+define amdgpu_kernel void @noalias_readonly_global_kernarg(ptr addrspace(1) noalias readonly %arg) {
 ; CHECK-LABEL: @noalias_readonly_global_kernarg(
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(1)* %arg
+  store i32 0, ptr addrspace(1) %arg
   ret void
 }
 
-define amdgpu_kernel void @readnone_global_kernarg(i32 addrspace(1)* readnone %arg) {
+define amdgpu_kernel void @readnone_global_kernarg(ptr addrspace(1) readnone %arg) {
 ; CHECK-LABEL: @readnone_global_kernarg(
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[ARG:%.*]], align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[ARG:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(1)* %arg
+  store i32 0, ptr addrspace(1) %arg
   ret void
 }
 
-define amdgpu_kernel void @readonly_global_kernarg(i32 addrspace(1)* readonly %arg) {
+define amdgpu_kernel void @readonly_global_kernarg(ptr addrspace(1) readonly %arg) {
 ; CHECK-LABEL: @readonly_global_kernarg(
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[ARG:%.*]], align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[ARG:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(1)* %arg
+  store i32 0, ptr addrspace(1) %arg
   ret void
 }
 
@@ -97,7 +97,7 @@ define amdgpu_kernel void @constant_gv_global_as() {
 ; CHECK-LABEL: @constant_gv_global_as(
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(1)* @global_as_constant
+  store i32 0, ptr addrspace(1) @global_as_constant
   ret void
 }
 
@@ -107,6 +107,6 @@ define amdgpu_kernel void @nonconst_gv_constant_as() {
 ; CHECK-LABEL: @nonconst_gv_constant_as(
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32 addrspace(4)* @global_nonconstant_constant_as
+  store i32 0, ptr addrspace(4) @global_nonconstant_constant_as
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 7cacdefeb3a1..9e4569afd6f2 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -13,13 +13,13 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 ; GCN-NEXT:    s_nop 2
 ; GCN-NOT:     v_accvgpr_read
 ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_mfma_store16(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %mai.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -32,15 +32,15 @@ bb:
 ; GCN-NEXT: s_nop 2
 ; GCN-NOT:  v_accvgpr_read
 ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load1_mfma_store1(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
-  %in.1 = load float, float addrspace(1)* %gep
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load float, ptr addrspace(1) %gep
   %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
   %elt = extractelement <32 x float> %mai.1, i32 0
-  store float %elt, float addrspace(1)* %gep
+  store float %elt, ptr addrspace(1) %gep
   ret void
 }
 
@@ -51,13 +51,13 @@ bb:
 ; GCN-NEXT: s_nop 4
 ; GCN-NOT:  v_accvgpr_read
 ; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load4_mfma_store4(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <4 x i32>, ptr addrspace(1) %gep
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
+  store <4 x i32> %mai.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -65,13 +65,13 @@ bb:
 ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
 ; GCN-NOT:     v_accvgpr
 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
-  store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
+  %gep.1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %gep.2 = getelementptr inbounds <32 x float>, ptr addrspace(1) %gep.1, i32 32
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep.1
+  store <32 x float> %in.1, ptr addrspace(1) %gep.2
   ret void
 }
 
@@ -84,14 +84,14 @@ bb:
 ; GCN-NEXT:     s_nop 2
 ; GCN-NOT:      v_accvgpr_read
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_add_mfma_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %add.1 = fadd <32 x float> %in.1, %in.1
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %mai.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -101,13 +101,13 @@ bb:
 ; GCN-COUNT-16: v_pk_add_f32
 ; GCN-NOT:      v_accvgpr
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_add_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %add.1 = fadd <32 x float> %in.1, %in.1
-  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %add.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -118,14 +118,14 @@ bb:
 ; GCN-COUNT-32: v_accvgpr_read
 ; GCN:          v_pk_add_f32
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_mfma_add_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
   %add.1 = fadd <32 x float> %mai.1, %in.1
-  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %add.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -137,15 +137,15 @@ bb:
 ; GCN-COUNT-32: v_accvgpr_read
 ; GCN:          v_pk_mul_f32
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_load_add_mfma_mul_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %add.1 = fadd <32 x float> %in.1, %in.1
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
   %mul.1 = fmul <32 x float> %mai.1, %mai.1
-  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %mul.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -156,15 +156,15 @@ bb:
 ; GCN-COUNT-32: v_accvgpr_read
 ; GCN:          v_pk_mul_f32
 ; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep
   %add.1 = fadd <32 x float> %in.1, %in.1
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
   %mul.1 = fmul <32 x float> %mai.1, %in.1
-  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %mul.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -174,16 +174,16 @@ bb:
 ; GCN:         v_mfma_f32_32x32x1f32
 ; GCN-NOT:     v_accvgpr_read
 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
-define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
-  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
+  %gep.1 = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %gep.2 = getelementptr inbounds <32 x float>, ptr addrspace(1) %gep.1, i32 32
+  %in.1 = load <32 x float>, ptr addrspace(1) %gep.1
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
   %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
-  store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
+  store <32 x float> %mai.1, ptr addrspace(1) %gep.1
+  store <32 x float> %mai.2, ptr addrspace(1) %gep.2
   ret void
 }
 
@@ -198,19 +198,19 @@ bb:
 ; GCN:     v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
 ; GCN:     global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
 ; GCN:     global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
-define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
-  %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i32 1 seq_cst
   %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
   %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
   %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
   %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
   %elt = extractelement <4 x i32> %mai.1, i32 0
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
-  store i32 %val, i32 addrspace(1)* %arg
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %elt seq_cst
+  store i32 %val, ptr addrspace(1) %arg
   ret void
 }
 
@@ -221,11 +221,11 @@ bb:
 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
 ; GCN:         global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
-define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(ptr addrspace(1) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
-  %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
+  %in.1 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 1 seq_cst
   %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
   %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
   %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
@@ -235,8 +235,8 @@ bb:
   %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
   %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
   %v2 = bitcast <2 x i32> %v2.2 to i64
-  %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
-  store i64 %val, i64 addrspace(1)* %arg
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i64 %v2 seq_cst
+  store i64 %val, ptr addrspace(1) %arg
   ret void
 }
 
@@ -248,17 +248,16 @@ bb:
 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-NOT: v_accvgpr_read
 ; GCN:     ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
-define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) #0 {
+define amdgpu_kernel void @test_load_mfma_ds2_store(ptr addrspace(3) %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
-  %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
+  %gep.1 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %arg, i32 %tid
+  %in.1 = load <4 x i32>, ptr addrspace(3) %gep.1
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
   %elt = extractelement <4 x i32> %mai.1, i32 0
-  %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
-  %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
-  store i32 1, i32 addrspace(3)* %ptr
-  store i32 %elt, i32 addrspace(3)* %gep.2
+  %gep.2 = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 32
+  store i32 1, ptr addrspace(3) %arg
+  store i32 %elt, ptr addrspace(3) %gep.2
   ret void
 }
 
@@ -268,11 +267,11 @@ bb:
 ; GCN:     v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
 ; GCN-NOT: v_accvgpr_read
 ; GCN:     global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
-define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_4xi32(ptr addrspace(1) %arg) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
-  %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
+  %gep = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i32 %tid
+  %in = load <4 x i32>, ptr addrspace(1) %gep
   br label %for.cond.preheader
 
 for.cond.preheader:
@@ -284,7 +283,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
+  store <4 x i32> %mai.1, ptr addrspace(1) %gep
   ret void
 }
 
@@ -295,11 +294,11 @@ exit:
 ; GCN-NOT:     v_accvgpr_read
 ; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
 ; GCN:         s_endpgm
-define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mfma_loop_32xfloat(ptr addrspace(1) %arg) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
-  %in = load <32 x float>, <32 x float> addrspace(1)* %gep
+  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %tid
+  %in = load <32 x float>, ptr addrspace(1) %gep
   br label %for.cond.preheader
 
 for.cond.preheader:
@@ -311,7 +310,7 @@ for.cond.preheader:
   br i1 %cc, label %exit, label %for.cond.preheader
 
 exit:
-  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
+  store <32 x float> %mai.1, ptr addrspace(1) %gep
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index d79e70486d15..7c0d7b6ea236 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -8,7 +8,7 @@
 ; from a register.
 ; GCN-LABEL: name: test_load_zext
 ; GCN: %[[OFFSET:[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer
-; SDAG: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0 :: (invariant load (s128) from %ir.13, addrspace 4)
+; SDAG: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0 :: (invariant load (s128) from %ir.12, addrspace 4)
 ; GISEL: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR %{{[0-9]+}}, %[[OFFSET]], 0 :: (invariant load (<4 x s32>) from {{.*}}, addrspace 4)
 define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %resNode0, i32 inreg %resNode1, <3 x i32> inreg %2, i32 inreg %3, <3 x i32> %4) local_unnamed_addr #2 {
 .entry:
@@ -16,14 +16,13 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
   %6 = bitcast i64 %5 to <2 x i32>
   %7 = insertelement <2 x i32> %6, i32 %resNode0, i32 0
   %8 = bitcast <2 x i32> %7 to i64
-  %9 = inttoptr i64 %8 to [4294967295 x i8] addrspace(4)*
+  %9 = inttoptr i64 %8 to ptr addrspace(4)
   %10 = call i32 @llvm.amdgcn.reloc.constant(metadata !4)
   %11 = zext i32 %10 to i64
-  %12 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %9, i64 0, i64 %11
-  %13 = bitcast i8 addrspace(4)* %12 to <4 x i32> addrspace(4)*, !amdgpu.uniform !5
-  %14 = load <4 x i32>, <4 x i32> addrspace(4)* %13, align 16, !invariant.load !5
-  %15 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %14, i32 0, i32 0)
-  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %15, <4 x i32> %14, i32 0, i32 0, i32 0)
+  %12 = getelementptr [4294967295 x i8], ptr addrspace(4) %9, i64 0, i64 %11
+  %13 = load <4 x i32>, ptr addrspace(4) %12, align 16, !invariant.load !5
+  %14 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %13, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %14, <4 x i32> %13, i32 0, i32 0, i32 0)
   ret void
 }
 
@@ -34,13 +33,13 @@ define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %res
 ; GCN-DAG: %[[OFFSET:.*]]:sreg_32 = S_LSHL_B32
 ; SDAG: S_LOAD_DWORD_SGPR killed %[[BASE]], killed %[[OFFSET]],
 ; GISEL: S_LOAD_DWORD_SGPR %[[BASE]], %[[OFFSET]],
-define amdgpu_ps void @test_complex_reg_offset(float addrspace(1)* %out) {
-  %i = load i32, i32 addrspace(4)* @1
+define amdgpu_ps void @test_complex_reg_offset(ptr addrspace(1) %out) {
+  %i = load i32, ptr addrspace(4) @1
   %i1 = and i32 %i, 3
   %i2 = zext i32 %i1 to i64
-  %i3 = getelementptr [4 x <2 x float>], [4 x <2 x float>] addrspace(4)* @0, i64 0, i64 %i2, i64 0
-  %i4 = load float, float addrspace(4)* %i3, align 4
-  store float %i4, float addrspace(1)* %out
+  %i3 = getelementptr [4 x <2 x float>], ptr addrspace(4) @0, i64 0, i64 %i2, i64 0
+  %i4 = load float, ptr addrspace(4) %i3, align 4
+  store float %i4, ptr addrspace(1) %out
   ret void
 }
 
@@ -55,14 +54,13 @@ define amdgpu_ps void @test_complex_reg_offset(float addrspace(1)* %out) {
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
 ; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
 ; GISEL: S_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
-define amdgpu_ps void @test_sgpr_plus_imm_offset(i8 addrspace(4)* inreg %base, i32 inreg %offset,
-                                                 i32 addrspace(1)* inreg %out) {
-  %v1 = getelementptr i8, i8 addrspace(4)* %base, i64 16
+define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i32 inreg %offset,
+                                                 ptr addrspace(1) inreg %out) {
+  %v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
   %v2 = zext i32 %offset to i64
-  %v3 = getelementptr i8, i8 addrspace(4)* %v1, i64 %v2
-  %v4 = bitcast i8 addrspace(4)* %v3 to i32 addrspace(4)*
-  %v5 = load i32, i32 addrspace(4)* %v4, align 4
-  store i32 %v5, i32 addrspace(1)* %out, align 4
+  %v3 = getelementptr i8, ptr addrspace(4) %v1, i64 %v2
+  %v5 = load i32, ptr addrspace(4) %v3, align 4
+  store i32 %v5, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -77,14 +75,13 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(i8 addrspace(4)* inreg %base, i
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
 ; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
 ; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
-define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(i8 addrspace(4)* inreg %base, i32 inreg %offset,
-                                                    <2 x i32> addrspace(1)* inreg %out) {
-  %v1 = getelementptr i8, i8 addrspace(4)* %base, i64 16
+define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
+                                                    ptr addrspace(1) inreg %out) {
+  %v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
   %v2 = zext i32 %offset to i64
-  %v3 = getelementptr i8, i8 addrspace(4)* %v1, i64 %v2
-  %v4 = bitcast i8 addrspace(4)* %v3 to <2 x i32> addrspace(4)*
-  %v5 = load <2 x i32>, <2 x i32> addrspace(4)* %v4, align 4
-  store <2 x i32> %v5, <2 x i32> addrspace(1)* %out, align 4
+  %v3 = getelementptr i8, ptr addrspace(4) %v1, i64 %v2
+  %v5 = load <2 x i32>, ptr addrspace(4) %v3, align 4
+  store <2 x i32> %v5, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -103,10 +100,10 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(i8 addrspace(4)* inreg %base
 ; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr4
 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[OFFSET]], 77,
-define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %base, i32 inreg %i, i32 addrspace(1)* inreg %out) {
+define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) {
   %off = add nuw nsw i32 %i, 77
   %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
-  store i32 %v, i32 addrspace(1)* %out, align 4
+  store i32 %v, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -127,11 +124,11 @@ define amdgpu_cs void @test_buffer_load_sgpr_plus_imm_offset(<4 x i32> inreg %ba
 ; GISEL-DAG: %[[SHIFT:.*]]:sreg_32 = S_LSHL_B32 %[[INDEX]],
 ; GISEL-DAG: %[[BASE:.*]]:sgpr_128 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1, %[[BASE2]], %subreg.sub2, %[[BASE3]], %subreg.sub3
 ; GISEL: S_BUFFER_LOAD_DWORD_SGPR_IMM %[[BASE]], %[[SHIFT]], 5,
-define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(<4 x i32> inreg %base, i32 inreg %i, i32 addrspace(1)* inreg %out) {
+define amdgpu_cs void @test_buffer_load_sgpr_or_imm_offset(<4 x i32> inreg %base, i32 inreg %i, ptr addrspace(1) inreg %out) {
   %shift = shl i32 %i, 7
   %off = or i32 %shift, 5
   %v = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %base, i32 %off, i32 0)
-  store i32 %v, i32 addrspace(1)* %out, align 4
+  store i32 %v, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
index e5de201bedf8..9f5b6389ab59 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -early-live-intervals -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
 
-define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(float addrspace(1)* %p) #4 {
+define weak_odr amdgpu_kernel void @test_mul24_knownbits_kernel(ptr addrspace(1) %p) #4 {
 ; GCN-LABEL: test_mul24_knownbits_kernel:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
@@ -24,8 +24,8 @@ entry:
   %1 = mul nsw i32 %tid, -5
   %v1 = and i32 %1, -32
   %v2 = sext i32 %v1 to i64
-  %v3 = getelementptr inbounds float, float addrspace(1)* %p, i64 %v2
-  store float 0.000, float addrspace(1)* %v3, align 4
+  %v3 = getelementptr inbounds float, ptr addrspace(1) %p, i64 %v2
+  store float 0.000, ptr addrspace(1) %v3, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll
index 04c196b8b623..f48ff77abf9a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-unroll-threshold.ll
@@ -14,8 +14,8 @@
 ; CHECK-NOT: br i1 %cmp
 ; CHECK: ret void
 
-@in = internal unnamed_addr global i32* null, align 8
-@out = internal unnamed_addr global i32* null, align 8
+@in = internal unnamed_addr global ptr null, align 8
+@out = internal unnamed_addr global ptr null, align 8
 
 define void @unroll_default() {
 entry:
@@ -23,8 +23,8 @@ entry:
 
 do.body:                                          ; preds = %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
-  %v1 = load i64, i64* bitcast (i32** @in to i64*), align 8
-  store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
+  %v1 = load i64, ptr @in, align 8
+  store i64 %v1, ptr @out, align 8
   %inc = add nsw i32 %i.0, 1
   %cmp = icmp slt i32 %inc, 100
   br i1 %cmp, label %do.body, label %do.end
@@ -39,8 +39,8 @@ entry:
 
 do.body:                                          ; preds = %entry
   %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
-  %v1 = load i64, i64* bitcast (i32** @in to i64*), align 8
-  store i64 %v1, i64* bitcast (i32** @out to i64*), align 8
+  %v1 = load i64, ptr @in, align 8
+  store i64 %v1, ptr @out, align 8
   %inc = add nsw i32 %i.0, 1
   %cmp = icmp slt i32 %inc, 100
   br i1 %cmp, label %do.body, label %do.end

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e6bc9a7d697a..285b027c6788 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -16,10 +16,10 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_x (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,10 +31,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_y (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -46,10 +46,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_z (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -61,10 +61,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_x (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -76,10 +76,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_y (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -91,10 +91,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_z (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -106,10 +106,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
-define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -121,10 +121,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
-define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 
@@ -136,10 +136,10 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
-define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z (ptr addrspace(1) %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
-  store i32 %0, i32 addrspace(1)* %out
+  store i32 %0, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
index 04fbe2ae1f94..c2b5e6867ae7 100644
--- a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
+++ b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll
@@ -7,7 +7,7 @@
 ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
 ; CHECK: s_endpgm
 define amdgpu_kernel void @test() {
-  store i32 1, i32 addrspace(1)* @0
+  store i32 1, ptr addrspace(1) @0
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll b/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
index 6d26f571c726..f15435d9e200 100644
--- a/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/are-loads-from-same-base-ptr.ll
@@ -7,10 +7,10 @@
 ; GCN: global_load_dword
 ; GCN: ds_min_u32
 ; GCN: ds_max_u32
-define amdgpu_kernel void @are_loads_from_same_base_ptr_ds_atomic(i32 addrspace(1)* %arg0, i32 addrspace(3)* noalias %ptr0) #0 {
-  %tmp1 = load volatile i32, i32 addrspace(1)* %arg0
-  %tmp2 = atomicrmw umin i32 addrspace(3)* %ptr0, i32 %tmp1 seq_cst
-  %tmp3 = atomicrmw umax i32 addrspace(3)* %ptr0, i32 %tmp1 seq_cst
+define amdgpu_kernel void @are_loads_from_same_base_ptr_ds_atomic(ptr addrspace(1) %arg0, ptr addrspace(3) noalias %ptr0) #0 {
+  %tmp1 = load volatile i32, ptr addrspace(1) %arg0
+  %tmp2 = atomicrmw umin ptr addrspace(3) %ptr0, i32 %tmp1 seq_cst
+  %tmp3 = atomicrmw umax ptr addrspace(3) %ptr0, i32 %tmp1 seq_cst
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index f5505c97ebd5..14b0e0ff47d4 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -20,26 +20,26 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 ; alloca to a vector.  It currently fails because it does not know how
 ; to interpret:
-; getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b
+; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
 
 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16, addrspace(5)
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
-  %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
-  %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a_ptr, !range !0
-  %b = load i32, i32 addrspace(1)* %b_ptr, !range !0
+  %a_ptr = getelementptr inbounds i32, ptr addrspace(1) %inA, i32 %tid
+  %b_ptr = getelementptr inbounds i32, ptr addrspace(1) %inB, i32 %tid
+  %a = load i32, ptr addrspace(1) %a_ptr, !range !0
+  %b = load i32, ptr addrspace(1) %b_ptr, !range !0
   %result = add i32 %a, %b
-  %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b
-  store i32 %result, i32 addrspace(5)* %alloca_ptr, align 4
+  %alloca_ptr = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
+  store i32 %result, ptr addrspace(5) %alloca_ptr, align 4
   ; Dummy call
   call void @llvm.amdgcn.s.barrier()
-  %reload = load i32, i32 addrspace(5)* %alloca_ptr, align 4, !range !0
-  %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
-  store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
+  %reload = load i32, ptr addrspace(5) %alloca_ptr, align 4, !range !0
+  %out_ptr = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
+  store i32 %reload, ptr addrspace(1) %out_ptr, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index 3be0f3a74e5f..d97f545a0baa 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -7,15 +7,15 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; SI-DAG: v_mul_u32_u24
 ; SI-DAG: v_mul_hi_u32_u24
 ; SI: s_endpgm
-define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
-  %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid
-  %a = load i32, i32 addrspace(1)* %a_ptr
-  %b = load i32, i32 addrspace(1)* %b_ptr
+  %a_ptr = getelementptr [1025 x i32], ptr addrspace(1) %inA, i32 %tid, i32 0
+  %b_ptr = getelementptr i32, ptr addrspace(1) %inB, i32 %tid
+  %a = load i32, ptr addrspace(1) %a_ptr
+  %b = load i32, ptr addrspace(1) %b_ptr
   %result = add i32 %a, %b
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 1edec164ef26..7da058ca6ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -121,69 +121,69 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 ; CHECK: NumSGPRsForWavesPerEU: 12
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
-  %val0 = load volatile float, float addrspace(1)* @var
-  %val1 = load volatile float, float addrspace(1)* @var
-  %val2 = load volatile float, float addrspace(1)* @var
-  %val3 = load volatile float, float addrspace(1)* @var
-  %val4 = load volatile float, float addrspace(1)* @var
-  %val5 = load volatile float, float addrspace(1)* @var
-  %val6 = load volatile float, float addrspace(1)* @var
-  %val7 = load volatile float, float addrspace(1)* @var
-  %val8 = load volatile float, float addrspace(1)* @var
-  %val9 = load volatile float, float addrspace(1)* @var
-  %val10 = load volatile float, float addrspace(1)* @var
-  %val11 = load volatile float, float addrspace(1)* @var
-  %val12 = load volatile float, float addrspace(1)* @var
-  %val13 = load volatile float, float addrspace(1)* @var
-  %val14 = load volatile float, float addrspace(1)* @var
-  %val15 = load volatile float, float addrspace(1)* @var
-  %val16 = load volatile float, float addrspace(1)* @var
-  %val17 = load volatile float, float addrspace(1)* @var
-  %val18 = load volatile float, float addrspace(1)* @var
-  %val19 = load volatile float, float addrspace(1)* @var
-  %val20 = load volatile float, float addrspace(1)* @var
-  %val21 = load volatile float, float addrspace(1)* @var
-  %val22 = load volatile float, float addrspace(1)* @var
-  %val23 = load volatile float, float addrspace(1)* @var
-  %val24 = load volatile float, float addrspace(1)* @var
-  %val25 = load volatile float, float addrspace(1)* @var
-  %val26 = load volatile float, float addrspace(1)* @var
-  %val27 = load volatile float, float addrspace(1)* @var
-  %val28 = load volatile float, float addrspace(1)* @var
-  %val29 = load volatile float, float addrspace(1)* @var
-  %val30 = load volatile float, float addrspace(1)* @var
-
-  store volatile float %val0, float addrspace(1)* @var
-  store volatile float %val1, float addrspace(1)* @var
-  store volatile float %val2, float addrspace(1)* @var
-  store volatile float %val3, float addrspace(1)* @var
-  store volatile float %val4, float addrspace(1)* @var
-  store volatile float %val5, float addrspace(1)* @var
-  store volatile float %val6, float addrspace(1)* @var
-  store volatile float %val7, float addrspace(1)* @var
-  store volatile float %val8, float addrspace(1)* @var
-  store volatile float %val9, float addrspace(1)* @var
-  store volatile float %val10, float addrspace(1)* @var
-  store volatile float %val11, float addrspace(1)* @var
-  store volatile float %val12, float addrspace(1)* @var
-  store volatile float %val13, float addrspace(1)* @var
-  store volatile float %val14, float addrspace(1)* @var
-  store volatile float %val15, float addrspace(1)* @var
-  store volatile float %val16, float addrspace(1)* @var
-  store volatile float %val17, float addrspace(1)* @var
-  store volatile float %val18, float addrspace(1)* @var
-  store volatile float %val19, float addrspace(1)* @var
-  store volatile float %val20, float addrspace(1)* @var
-  store volatile float %val21, float addrspace(1)* @var
-  store volatile float %val22, float addrspace(1)* @var
-  store volatile float %val23, float addrspace(1)* @var
-  store volatile float %val24, float addrspace(1)* @var
-  store volatile float %val25, float addrspace(1)* @var
-  store volatile float %val26, float addrspace(1)* @var
-  store volatile float %val27, float addrspace(1)* @var
-  store volatile float %val28, float addrspace(1)* @var
-  store volatile float %val29, float addrspace(1)* @var
-  store volatile float %val30, float addrspace(1)* @var
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
+
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index 337dcfc652bd..71c7c8f52dc1 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -8,7 +8,7 @@
 ; Subtargets must wait for outstanding memory instructions before a barrier if
 ; they cannot back off of the barrier.
 
-define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
+define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 ; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
 ; GFX9-NO-BACKOFF:       ; %bb.0:
 ; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -40,13 +40,13 @@ define void @back_off_barrier_no_fence(i32* %in, i32* %out) #0 {
 ; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-  %load = load i32, i32* %in
+  %load = load i32, ptr %in
   call void @llvm.amdgcn.s.barrier()
-  store i32 %load, i32* %out
+  store i32 %load, ptr %out
   ret void
 }
 
-define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
+define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 {
 ; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
 ; GFX9-NO-BACKOFF:       ; %bb.0:
 ; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -84,11 +84,11 @@ define void @back_off_barrier_with_fence(i32* %in, i32* %out) #0 {
 ; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
-  %load = load i32, i32* %in
+  %load = load i32, ptr %in
   fence syncscope("workgroup") release
   call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  store i32 %load, i32* %out
+  store i32 %load, ptr %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
index 92bac604a6b6..bbe30fb235fb 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s --check-prefixes=VI-SDWA
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck %s --check-prefixes=CI
 
-define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
+define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) {
 ; VI-LABEL: bfe_combine8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -62,13 +62,13 @@ define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x
   %idx = add i32 %x, %id
   %srl = lshr i32 %idx, 8
   %and = and i32 %srl, 255
-  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
-  %val = load i32, i32 addrspace(1)* %ptr, align 4
-  store i32 %val, i32 addrspace(1)* %arg, align 4
+  %ptr = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %and
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  store i32 %val, ptr addrspace(1) %arg, align 4
   ret void
 }
 
-define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) {
+define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) {
 ; VI-LABEL: bfe_combine16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -132,9 +132,9 @@ define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %
   %idx = add i32 %x, %id
   %srl = lshr i32 %idx, 1
   %and = and i32 %srl, 2147450880
-  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
-  %val = load i32, i32 addrspace(1)* %ptr, align 4
-  store i32 %val, i32 addrspace(1)* %arg, align 4
+  %ptr = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %and
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  store i32 %val, ptr addrspace(1) %arg, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 1fedb30e31fc..1f0e09371d6d 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -23,20 +23,18 @@
 
 ; GCN: [[BB1]]
 ; GCN: s_or_b64 exec, exec
-define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval(%struct.ByValStruct) noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval(%struct.ByValStruct) noalias nocapture align 4 %arg1, i1 %cond) #1 {
+define hidden void @void_func_byval_struct_use_outside_entry_block(ptr addrspace(5) byval(%struct.ByValStruct) noalias nocapture align 4 %arg0, ptr addrspace(5) byval(%struct.ByValStruct) noalias nocapture align 4 %arg1, i1 %cond) #1 {
 entry:
   br i1 %cond, label %bb0, label %bb1
 
 bb0:
-  %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
-  %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+  %tmp = load volatile i32, ptr addrspace(5) %arg0, align 4
   %add = add nsw i32 %tmp, 1
-  store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
-  %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+  store volatile i32 %add, ptr addrspace(5) %arg0, align 4
+  %tmp1 = load volatile i32, ptr addrspace(5) %arg1, align 4
   %add3 = add nsw i32 %tmp1, 2
-  store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
-  store volatile i32 9, i32 addrspace(1)* null, align 4
+  store volatile i32 %add3, ptr addrspace(5) %arg1, align 4
+  store volatile i32 9, ptr addrspace(1) null, align 4
   br label %bb1
 
 bb1:
@@ -44,8 +42,8 @@ bb1:
 }
 declare hidden void @external_void_func_void() #0
 
-declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3
-declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3
+declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #3
+declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #3
 
 attributes #0 = { nounwind }
 attributes #1 = { noinline norecurse nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index cd0a39856abc..38fd778fba68 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -3,12 +3,12 @@
 ; GCN-LABEL: {{^}}store_fi_lifetime:
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @store_fi_lifetime(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %b = alloca i8, addrspace(5)
-  call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %b)
-  store volatile i8 addrspace(5)* %b, i8 addrspace(5)* addrspace(1)* undef
-  call void @llvm.lifetime.end.p5i8(i64 1, i8 addrspace(5)* %b)
+  call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %b)
+  store volatile ptr addrspace(5) %b, ptr addrspace(1) undef
+  call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) %b)
   ret void
 }
 
@@ -18,10 +18,10 @@ entry:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off,
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
-define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds(ptr addrspace(3) %ptr) #0 {
   %tmp = alloca float, addrspace(5)
-  store float 4.0, float  addrspace(5)*%tmp
-  store float addrspace(5)* %tmp, float addrspace(5)* addrspace(3)* %ptr
+  store float 4.0, ptr  addrspace(5) %tmp
+  store ptr addrspace(5) %tmp, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -38,13 +38,13 @@ define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %p
 
 ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: ds_write_b32  [[VLDSPTR]], [[FI1]]
-define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(ptr addrspace(3) %ptr) #0 {
   %tmp0 = alloca float, addrspace(5)
   %tmp1 = alloca float, addrspace(5)
-  store float 4.0, float addrspace(5)* %tmp0
-  store float 4.0, float addrspace(5)* %tmp1
-  store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(3)* %ptr
-  store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(3)* %ptr
+  store float 4.0, ptr addrspace(5) %tmp0
+  store float 4.0, ptr addrspace(5) %tmp1
+  store volatile ptr addrspace(5) %tmp0, ptr addrspace(3) %ptr
+  store volatile ptr addrspace(5) %tmp1, ptr addrspace(3) %ptr
   ret void
 }
 
@@ -55,12 +55,11 @@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)*
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 define amdgpu_kernel void @stored_fi_to_self() #0 {
-  %tmp = alloca i32 addrspace(5)*, addrspace(5)
+  %tmp = alloca ptr addrspace(5), addrspace(5)
 
   ; Avoid optimizing everything out
-  store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp
-  %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)*
-  store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp
+  store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp
+  store volatile ptr addrspace(5) %tmp, ptr addrspace(5) %tmp
   ret void
 }
 
@@ -75,16 +74,14 @@ define amdgpu_kernel void @stored_fi_to_self() #0 {
 ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}}
 define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32], addrspace(5)
-  %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
+  %tmp1 = alloca ptr addrspace(5), addrspace(5)
 
   ; Avoid optimizing everything out
-  %tmp0.cast = bitcast [512 x i32] addrspace(5)* %tmp0 to i32 addrspace(5)*
-  store volatile i32 32, i32 addrspace(5)* %tmp0.cast
+  store volatile i32 32, ptr addrspace(5) %tmp0
 
-  store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1
+  store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp1
 
-  %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)*
-  store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp1
+  store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp1
   ret void
 }
 
@@ -99,18 +96,16 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}}
 define amdgpu_kernel void @stored_fi_to_fi() #0 {
-  %tmp0 = alloca i32 addrspace(5)*, addrspace(5)
-  %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
-  %tmp2 = alloca i32 addrspace(5)*, addrspace(5)
-  store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp0
-  store volatile i32 addrspace(5)* inttoptr (i32 5678 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1
-  store volatile i32 addrspace(5)* inttoptr (i32 9999 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp2
-
-  %bitcast1 = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)*
-  %bitcast2 = bitcast i32 addrspace(5)* addrspace(5)* %tmp2 to i32 addrspace(5)* ;  at offset 8
-
-  store volatile i32 addrspace(5)* %bitcast1, i32 addrspace(5)* addrspace(5)* %tmp2 ; store offset 4 at offset 8
-  store volatile i32 addrspace(5)* %bitcast2, i32 addrspace(5)* addrspace(5)* %tmp1 ; store offset 8 at offset 4
+  %tmp0 = alloca ptr addrspace(5), addrspace(5)
+  %tmp1 = alloca ptr addrspace(5), addrspace(5)
+  %tmp2 = alloca ptr addrspace(5), addrspace(5)
+  store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp0
+  store volatile ptr addrspace(5) inttoptr (i32 5678 to ptr addrspace(5)), ptr addrspace(5) %tmp1
+  store volatile ptr addrspace(5) inttoptr (i32 9999 to ptr addrspace(5)), ptr addrspace(5) %tmp2
+
+
+  store volatile ptr addrspace(5) %tmp1, ptr addrspace(5) %tmp2 ; store offset 4 at offset 8
+  store volatile ptr addrspace(5) %tmp2, ptr addrspace(5) %tmp1 ; store offset 8 at offset 4
   ret void
 }
 
@@ -118,10 +113,10 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
   %tmp = alloca float, addrspace(5)
-  store float 0.0, float  addrspace(5)*%tmp
-  store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr
+  store float 0.0, ptr  addrspace(5) %tmp
+  store ptr addrspace(5) %tmp, ptr addrspace(1) %ptr
   ret void
 }
 
@@ -136,15 +131,15 @@ define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)*
 
 ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1) %ptr) #0 {
   %tmp0 = alloca float, addrspace(5)
   %tmp1 = alloca float, addrspace(5)
   %tmp2 = alloca float, addrspace(5)
-  store volatile float 0.0, float  addrspace(5)*%tmp0
-  store volatile float 0.0, float  addrspace(5)*%tmp1
-  store volatile float 0.0, float  addrspace(5)*%tmp2
-  store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr
-  store volatile float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr
+  store volatile float 0.0, ptr  addrspace(5) %tmp0
+  store volatile float 0.0, ptr  addrspace(5) %tmp1
+  store volatile float 0.0, ptr  addrspace(5) %tmp2
+  store volatile ptr addrspace(5) %tmp1, ptr addrspace(1) %ptr
+  store volatile ptr addrspace(5) %tmp2, ptr addrspace(1) %ptr
   ret void
 }
 
@@ -163,19 +158,18 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5
 ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 
 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(ptr addrspace(1) %ptr) #0 {
   %tmp0 = alloca [4096 x i32], addrspace(5)
   %tmp1 = alloca [4096 x i32], addrspace(5)
-  %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 0
-  store volatile i32 0, i32 addrspace(5)* %gep0.tmp0
-  %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 4095
-  store volatile i32 999, i32 addrspace(5)* %gep1.tmp0
-  %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 14
-  store i32 addrspace(5)* %gep0.tmp1, i32 addrspace(5)* addrspace(1)* %ptr
+  store volatile i32 0, ptr addrspace(5) %tmp0
+  %gep1.tmp0 = getelementptr [4096 x i32], ptr addrspace(5) %tmp0, i32 0, i32 4095
+  store volatile i32 999, ptr addrspace(5) %gep1.tmp0
+  %gep0.tmp1 = getelementptr [4096 x i32], ptr addrspace(5) %tmp0, i32 0, i32 14
+  store ptr addrspace(5) %gep0.tmp1, ptr addrspace(1) %ptr
   ret void
 }
 
- at g1 = external addrspace(1) global i32 addrspace(5)*
+ at g1 = external addrspace(1) global ptr addrspace(5)
 
 ; This was leaving a dead node around resulting in failing to select
 ; on the leftover AssertZext's ValueType operand.
@@ -186,18 +180,18 @@ define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1 at gotpcrel32@hi+12
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+define amdgpu_kernel void @cannot_select_assertzext_valuetype(ptr addrspace(1) %out, i32 %idx) #0 {
 entry:
   %b = alloca i32, align 4, addrspace(5)
-  %tmp1 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* @g1, align 4
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %tmp1, i32 %idx
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store volatile i32 addrspace(5)* %b, i32 addrspace(5)* addrspace(1)* undef
+  %tmp1 = load volatile ptr addrspace(5), ptr addrspace(1) @g1, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %tmp1, i32 %idx
+  %tmp2 = load i32, ptr addrspace(5) %arrayidx, align 4
+  store volatile ptr addrspace(5) %b, ptr addrspace(1) undef
   ret void
 }
 
-declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1
-declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1
+declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #1
+declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 3c23aa5cb5c1..83647a04467f 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -61,7 +61,7 @@ define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
 ; GFX1100-NEXT:    s_endpgm
 entry:
   %x = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  store volatile i32 0, ptr addrspace(5) %x, align 4
   ret void
 }
 
@@ -243,7 +243,7 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 
 entry:
   %x = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  store volatile i32 0, ptr addrspace(5) %x, align 4
   tail call void @ex() #0
   ret void
 }
@@ -314,7 +314,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
 ; GFX1100-NEXT:    s_endpgm
 entry:
   %x = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  store volatile i32 0, ptr addrspace(5) %x, align 4
   ret void
 }
 
@@ -521,7 +521,7 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT:    s_endpgm
 entry:
   %x = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %x, align 4
+  store volatile i32 0, ptr addrspace(5) %x, align 4
   tail call void @ex() #2
   ret void
 }
@@ -598,19 +598,18 @@ entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
   %alloca = alloca i8, i32 4092, align 4, addrspace(5)
-  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
-  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+  %aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
   ; 0x40000 / 64 = 4096 (for wave64)
   ; CHECK: s_add_u32 s6, s7, 0x40000
   ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
-  %a = load volatile i32, i32 addrspace(5)* %aptr
+  %a = load volatile i32, ptr addrspace(5) %aptr
 
   ; Force %a to spill
   call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
 
-  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
-  store volatile i32 %a, i32 addrspace(5)* %outptr
+  %outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
+  store volatile i32 %a, ptr addrspace(5) %outptr
 
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 26966ff49372..aee6f0e82d25 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -5,20 +5,20 @@
 
 ; Make sure we match the addressing mode offset of csub intrinsics across blocks.
 
-define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7
-; OPT-NEXT:    [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2)
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 7
+; OPT-NEXT:    [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) [[IN_GEP]], i32 2)
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT:    store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i32 999999
+; OPT-NEXT:    store i32 [[X]], ptr addrspace(1) [[OUT_GEP]], align 4
 ; OPT-NEXT:    br label [[DONE:%.*]]
 ; OPT:       done:
 ; OPT-NEXT:    ret void
@@ -48,21 +48,21 @@ entry:
   br i1 %cmp, label %endif, label %if
 
 if:
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
-  %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2)
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 7
+  %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %in.gep, i32 2)
   br label %endif
 
 endif:
   %x = phi i32 [ %val, %if ], [ 0, %entry ]
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
-  store i32 %x, i32 addrspace(1)* %out.gep
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 999999
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
   ret void
 }
 
-declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
+declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 
 attributes #0 = { argmemonly nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 66e4d59fe3d2..494b4b5c48ba 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -4,21 +4,21 @@
 
 ; Make sure we match the addressing mode offset of global.atomic.fadd intrinsics across blocks.
 
-define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
 ; OPT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TID]], 0
 ; OPT-NEXT:    br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
 ; OPT:       if:
-; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7
-; OPT-NEXT:    [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00)
-; OPT-NEXT:    [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
+; OPT-NEXT:    [[IN_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[IN:%.*]], i32 7
+; OPT-NEXT:    [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) [[IN_GEP]], float 2.000000e+00)
+; OPT-NEXT:    [[VAL:%.*]] = load volatile float, ptr addrspace(1) undef, align 4
 ; OPT-NEXT:    br label [[ENDIF]]
 ; OPT:       endif:
 ; OPT-NEXT:    [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT:    store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT:    [[OUT_GEP:%.*]] = getelementptr float, ptr addrspace(1) [[OUT:%.*]], i32 999999
+; OPT-NEXT:    store float [[X]], ptr addrspace(1) [[OUT_GEP]], align 4
 ; OPT-NEXT:    br label [[DONE:%.*]]
 ; OPT:       done:
 ; OPT-NEXT:    ret void
@@ -50,15 +50,15 @@ entry:
   br i1 %cmp, label %endif, label %if
 
 if:
-  %in.gep = getelementptr float, float addrspace(1)* %in, i32 7
-  %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
-  %val = load volatile float, float addrspace(1)* undef
+  %in.gep = getelementptr float, ptr addrspace(1) %in, i32 7
+  %fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %in.gep, float 2.0)
+  %val = load volatile float, ptr addrspace(1) undef
   br label %endif
 
 endif:
   %x = phi float [ %val, %if ], [ 0.0, %entry ]
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
-  store float %x, float addrspace(1)* %out.gep
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 999999
+  store float %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -66,7 +66,7 @@ done:
 }
 
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #2
+declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) nocapture, float) #2
 
 attributes #0 = { argmemonly nounwind }
 attributes #1 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 1424ea2ca06c..66cfa21b6dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -10,27 +10,27 @@
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
 ; OPT-LABEL: @test_sink_global_small_offset_i32(
-; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
-; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
+; OPT-CI-NOT: getelementptr i32, ptr addrspace(1) %in
+; OPT-VI: getelementptr i32, ptr addrspace(1) %in
 ; OPT: br i1
 ; OPT-CI: getelementptr i8,
 
 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
-define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(1)* %in.gep
+  %tmp1 = load i32, ptr addrspace(1) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -38,7 +38,7 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset(
-; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
+; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
@@ -49,22 +49,22 @@ done:
 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
 ; GCN: {{^}}.LBB1_2:
 ; GCN: s_or_b64 exec
-define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
-  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
+  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 65535
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp1 = load i8, ptr addrspace(1) %in.gep
   %tmp2 = sext i8 %tmp1 to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -78,22 +78,22 @@ done:
 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
 ; GCN: {{^}}.LBB2_2:
 ; GCN: s_or_b64 exec
-define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
-  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
+  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4095
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp1 = load i8, ptr addrspace(1) %in.gep
   %tmp2 = sext i8 %tmp1 to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -107,22 +107,22 @@ done:
 ; GFX9: global_load_sbyte {{v[0-9]+}}, [[VOFFSET]], {{s\[[0-9]+:[0-9]+\]$}}
 ; GCN: {{^}}.LBB3_2:
 ; GCN: s_or_b64 exec
-define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
-  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
+  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 4096
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp1 = load i8, ptr addrspace(1) %in.gep
   %tmp2 = sext i8 %tmp1 to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -139,27 +139,27 @@ done:
 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092 glc{{$}}
 ; GCN: {{^}}.LBB4_2:
-define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4, addrspace(5)
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
   %add.arg = add i32 %arg, 8
-  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
+  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1022
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  store volatile i32 123, i32 addrspace(5)* %alloca.gep
-  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
+  store volatile i32 123, ptr addrspace(5) %alloca.gep
+  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
-  store i32 %load, i32 addrspace(1)* %out.gep.1
+  store i32 %x, ptr addrspace(1) %out.gep.0
+  %load = load volatile i32, ptr addrspace(5) %alloca.gep
+  store i32 %load, ptr addrspace(1) %out.gep.1
   br label %done
 
 done:
@@ -180,27 +180,27 @@ done:
 ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092 glc{{$}}
 ; GCN: {{^.LBB[0-9]+}}_2:
 
-define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4, addrspace(5)
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
   %add.arg = add i32 %arg, 8
-  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
+  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1023
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  store volatile i32 123, i32 addrspace(5)* %alloca.gep
-  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
+  store volatile i32 123, ptr addrspace(5) %alloca.gep
+  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
-  store i32 %load, i32 addrspace(1)* %out.gep.1
+  store i32 %x, ptr addrspace(1) %out.gep.0
+  %load = load volatile i32, ptr addrspace(5) %alloca.gep
+  store i32 %load, ptr addrspace(1) %out.gep.1
   br label %done
 
 done:
@@ -208,7 +208,7 @@ done:
 }
 
 ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
-; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
+; OPT: %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
 ; OPT: br i1
 ; OPT-NOT: ptrtoint
 
@@ -217,27 +217,27 @@ done:
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc{{$}}
 ; GCN: {{^.LBB[0-9]+}}_2:
-define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4, addrspace(5)
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i64 999998
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i64 999999
   %add.arg = add i32 %arg, 8
-  %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
+  %alloca.gep = getelementptr [512 x i32], ptr addrspace(5) %alloca, i32 0, i32 1024
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  store volatile i32 123, i32 addrspace(5)* %alloca.gep
-  %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
+  store volatile i32 123, ptr addrspace(5) %alloca.gep
+  %tmp1 = load volatile i32, ptr addrspace(5) %alloca.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  %load = load volatile i32, i32 addrspace(5)* %alloca.gep
-  store i32 %load, i32 addrspace(1)* %out.gep.1
+  store i32 %x, ptr addrspace(1) %out.gep.0
+  %load = load volatile i32, ptr addrspace(5) %alloca.gep
+  store i32 %load, ptr addrspace(1) %out.gep.1
   br label %done
 
 done:
@@ -249,22 +249,22 @@ done:
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN: {{^.LBB[0-9]+}}_2:
-define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %offset) {
 entry:
   %offset.ext = zext i32 %offset to i64
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i64 %offset.ext
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(1)* %in.gep
+  %tmp1 = load i32, ptr addrspace(1) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -272,28 +272,28 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_small_offset_i32
-; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
+; OPT-NOT:  getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -301,28 +301,28 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
-; OPT-NOT:  getelementptr i32, i32 addrspace(4)*
+; OPT-NOT:  getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 255
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -330,9 +330,9 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
-; OPT-SI:  getelementptr i32, i32 addrspace(4)*
-; OPT-CI-NOT:  getelementptr i32, i32 addrspace(4)*
-; OPT-VI-NOT:  getelementptr i32, i32 addrspace(4)*
+; OPT-SI:  getelementptr i32, ptr addrspace(4)
+; OPT-CI-NOT:  getelementptr i32, ptr addrspace(4)
+; OPT-VI-NOT:  getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
@@ -341,21 +341,21 @@ done:
 
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 256
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -363,8 +363,8 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
-; OPT-SI: getelementptr i32, i32 addrspace(4)*
-; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
+; OPT-SI: getelementptr i32, ptr addrspace(4)
+; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
@@ -380,21 +380,21 @@ done:
 ; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffffff{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 4294967295
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -402,7 +402,7 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
-; OPT: getelementptr i32, i32 addrspace(4)*
+; OPT: getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
@@ -411,21 +411,21 @@ done:
 ; GCN: s_addc_u32
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 17179869181
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -441,21 +441,21 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262143
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -463,9 +463,9 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
-; OPT-SI: getelementptr i32, i32 addrspace(4)*
-; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
-; OPT-VI: getelementptr i32, i32 addrspace(4)*
+; OPT-SI: getelementptr i32, ptr addrspace(4)
+; OPT-CI-NOT: getelementptr i32, ptr addrspace(4)
+; OPT-VI: getelementptr i32, ptr addrspace(4)
 ; OPT: br i1
 
 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
@@ -479,21 +479,21 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 999999
+  %in.gep = getelementptr i32, ptr addrspace(4) %in, i64 262144
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  %tmp1 = load i32, ptr addrspace(4) %in.gep
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -509,15 +509,15 @@ done:
 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
-define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+define amdgpu_kernel void @sink_ds_address(ptr addrspace(3) nocapture %ptr) nounwind {
 entry:
-  %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
-  %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+  %x = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 0
+  %y = getelementptr inbounds %struct.foo, ptr addrspace(3) %ptr, i32 0, i32 1, i32 2
   br label %bb32
 
 bb32:
-  %a = load float, float addrspace(3)* %x, align 4
-  %b = load float, float addrspace(3)* %y, align 4
+  %a = load float, ptr addrspace(3) %x, align 4
+  %b = load float, ptr addrspace(3) %y, align 4
   %cmp = fcmp one float %a, %b
   br i1 %cmp, label %bb34, label %bb33
 
@@ -535,22 +535,21 @@ bb34:
 ; OPT: br i1 %tmp0,
 ; OPT: if:
 ; OPT: getelementptr i8, {{.*}} 4095
-define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(ptr addrspace(1) %out, ptr addrspace(4) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
-  %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
+  %in.gep = getelementptr i8, ptr addrspace(4) %in, i64 4095
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
-  %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
+  %tmp1 = load i32, ptr addrspace(4) %in.gep, align 1
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -558,25 +557,23 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
-define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1 = atomicrmw add ptr addrspace(3) %sunkaddr, i32 2 seq_cst
+define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
+  %tmp1 = atomicrmw add ptr addrspace(3) %in.gep, i32 2 seq_cst
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -584,26 +581,24 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
-define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1.struct = cmpxchg ptr addrspace(3) %sunkaddr, i32 undef, i32 2 seq_cst monotonic
+define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
+  %tmp1.struct = cmpxchg ptr addrspace(3) %in.gep, i32 undef, i32 2 seq_cst monotonic
   %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -611,25 +606,25 @@ done:
 }
 
 ; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
-; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+; OPT: %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
 ; OPT: br i1
-; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
-define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
+define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr ptr addrspace(3), ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
-  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
+  %tmp1.struct = cmpxchg ptr addrspace(3) undef, ptr addrspace(3) %in.gep, ptr addrspace(3) undef seq_cst monotonic
+  %tmp1 = extractvalue { ptr addrspace(3), i1 } %tmp1.struct, 0
   br label %endif
 
 endif:
-  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
-  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
+  %x = phi ptr addrspace(3) [ %tmp1, %if ], [ null, %entry ]
+  store ptr addrspace(3) %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -637,25 +632,23 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
-define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false)
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -663,25 +656,23 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
-define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false)
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -689,33 +680,33 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
-; OPT-SICIVI: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
 ; OPT-SICIV: br
-; OPT-SICIVI: %tmp1 = load i8, i8 addrspace(1)* %in.gep
+; OPT-SICIVI: %tmp1 = load i8, ptr addrspace(1) %in.gep
 
 ; OPT-GFX9: br
-; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
-; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
+; OPT-GFX9: %sunkaddr = getelementptr i8, ptr addrspace(1) %in, i64 -4096
+; OPT-GFX9: load i8, ptr addrspace(1) %sunkaddr
 
 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
 ; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
-define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
-  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4096
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 1024
+  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp1 = load i8, ptr addrspace(1) %in.gep
   %tmp2 = sext i8 %tmp1 to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -723,27 +714,27 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_global_small_min_scratch_global_neg1_offset(
-; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+; OPT: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
 ; OPT: br
-; OPT: load i8, i8 addrspace(1)* %in.gep
+; OPT: load i8, ptr addrspace(1) %in.gep
 
 ; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_neg1_offset:
-define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_min_scratch_global_neg1_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
-  %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 -4097
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i64 99999
+  %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4097
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = load i8, i8 addrspace(1)* %in.gep
+  %tmp1 = load i8, ptr addrspace(1) %in.gep
   %tmp2 = sext i8 %tmp1 to i32
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(1)* %out.gep
+  store i32 %x, ptr addrspace(1) %out.gep
   br label %done
 
 done:
@@ -751,25 +742,23 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_small_offset_ds_append(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %1, i1 false)
-define amdgpu_kernel void @test_sink_small_offset_ds_append(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %sunkaddr, i1 false)
+define amdgpu_kernel void @test_sink_small_offset_ds_append(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %in.gep, i1 false)
+  %tmp1 = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %in.gep, i1 false)
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -777,25 +766,23 @@ done:
 }
 
 ; OPT-LABEL: @test_sink_small_offset_ds_consume(
-; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
-; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
-; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
-; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %1, i1 false)
-define amdgpu_kernel void @test_sink_small_offset_ds_consume(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
+; OPT: %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %sunkaddr, i1 false)
+define amdgpu_kernel void @test_sink_small_offset_ds_consume(ptr addrspace(3) %out, ptr addrspace(3) %in) {
 entry:
-  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
-  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
+  %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
   br i1 %tmp0, label %endif, label %if
 
 if:
-  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %in.gep, i1 false)
+  %tmp1 = call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %in.gep, i1 false)
   br label %endif
 
 endif:
   %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(3)* %out.gep
+  store i32 %x, ptr addrspace(3) %out.gep
   br label %done
 
 done:
@@ -803,10 +790,10 @@ done:
 }
 
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3
-declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #3
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3
+declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 44e5a0b0000a..2ad47482cedb 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -62,10 +62,9 @@ define <2 x half> @chain_hi_to_lo_private() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
-  %load_lo = load half, half addrspace(5)* %gep_lo
-  %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
-  %load_hi = load half, half addrspace(5)* %gep_hi
+  %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
+  %load_lo = load half, ptr addrspace(5) %gep_lo
+  %load_hi = load half, ptr addrspace(5) null
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -73,7 +72,7 @@ bb:
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base_lo, ptr addrspace(5) %base_hi) {
 ; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -122,8 +121,8 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %load_lo = load half, half addrspace(5)* %base_lo
-  %load_hi = load half, half addrspace(5)* %base_hi
+  %load_lo = load half, ptr addrspace(5) %base_lo
+  %load_hi = load half, ptr addrspace(5) %base_hi
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -131,7 +130,7 @@ bb:
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
+define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
 ; GFX900-LABEL: chain_hi_to_lo_arithmatic:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -181,7 +180,7 @@ define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %arith_lo = fadd half %in, 1.0
-  %load_hi = load half, half addrspace(5)* %base
+  %load_hi = load half, ptr addrspace(5) %base
 
   %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -222,10 +221,9 @@ define <2 x half> @chain_hi_to_lo_group() {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
-  %load_lo = load half, half addrspace(3)* %gep_lo
-  %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
-  %load_hi = load half, half addrspace(3)* %gep_hi
+  %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1
+  %load_lo = load half, ptr addrspace(3) %gep_lo
+  %load_hi = load half, ptr addrspace(3) null
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -233,7 +231,7 @@ bb:
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_lo, ptr addrspace(3) %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_group_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -263,8 +261,8 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %load_lo = load half, half addrspace(3)* %base_lo
-  %load_hi = load half, half addrspace(3)* %base_hi
+  %load_lo = load half, ptr addrspace(3) %base_lo
+  %load_hi = load half, ptr addrspace(3) %base_hi
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -314,10 +312,9 @@ define <2 x half> @chain_hi_to_lo_global() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
-  %load_lo = load half, half addrspace(1)* %gep_lo
-  %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
-  %load_hi = load half, half addrspace(1)* %gep_hi
+  %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
+  %load_lo = load half, ptr addrspace(1) %gep_lo
+  %load_hi = load half, ptr addrspace(1) null
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -325,7 +322,7 @@ bb:
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
+define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_lo, ptr addrspace(1) %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_global_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -355,8 +352,8 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %bas
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %load_lo = load half, half addrspace(1)* %base_lo
-  %load_hi = load half, half addrspace(1)* %base_hi
+  %load_lo = load half, ptr addrspace(1) %base_lo
+  %load_hi = load half, ptr addrspace(1) %base_hi
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -406,10 +403,9 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds half, half* null, i64 1
-  %load_lo = load half, half* %gep_lo
-  %gep_hi = getelementptr inbounds half, half* null, i64 0
-  %load_hi = load half, half* %gep_hi
+  %gep_lo = getelementptr inbounds half, ptr null, i64 1
+  %load_lo = load half, ptr %gep_lo
+  %load_hi = load half, ptr null
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -417,7 +413,7 @@ bb:
   ret <2 x half> %result
 }
 
-define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
+define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -447,8 +443,8 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %ba
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %load_lo = load half, half* %base_lo
-  %load_hi = load half, half* %base_hi
+  %load_lo = load half, ptr %base_lo
+  %load_hi = load half, ptr %base_hi
 
   %temp = insertelement <2 x half> undef, half %load_lo, i32 0
   %result = insertelement <2 x half> %temp, half %load_hi, i32 1
@@ -457,7 +453,7 @@ bb:
 }
 
 ; Make sure we don't lose any of the private stores.
-define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) #0 {
 ; GFX900-LABEL: vload2_private:
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
@@ -610,33 +606,28 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
 ; GFX11-NEXT:    s_endpgm
 entry:
   %loc = alloca [3 x i16], align 2, addrspace(5)
-  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
-  %tmp = load i16, i16 addrspace(1)* %in, align 2
-  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
-  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
-  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
-  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
-  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
-  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
-  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
-  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
-  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
-  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
-  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
-  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
-  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
-  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
-  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
-  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
-  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
-  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
-  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  %tmp = load i16, ptr addrspace(1) %in, align 2
+  store volatile i16 %tmp, ptr addrspace(5) %loc
+  %arrayidx.1 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i16, ptr addrspace(1) %arrayidx.1, align 2
+  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1
+  store volatile i16 %tmp1, ptr addrspace(5) %loc.2.sroa_idx3
+  %arrayidx.2 = getelementptr inbounds i16, ptr addrspace(1) %in, i64 2
+  %tmp2 = load i16, ptr addrspace(1) %arrayidx.2, align 2
+  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 2
+  store volatile i16 %tmp2, ptr addrspace(5) %loc.4.sroa_idx
+  %loc.0. = load <2 x i16>, ptr addrspace(5) %loc, align 2
+  store <2 x i16> %loc.0., ptr addrspace(1) %out, align 4
+  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], ptr addrspace(5) %loc, i32 0, i32 1
+  %loc.2. = load <2 x i16>, ptr addrspace(5) %loc.2.sroa_idx, align 2
+  %arrayidx6 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 1
+  store <2 x i16> %loc.2., ptr addrspace(1) %arrayidx6, align 4
   ret void
 }
 
 ; There is another instruction between the misordered instruction and
 ; the value dependent load, so a simple operand check is insufficient.
-define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) {
 ; GCN-LABEL: chain_hi_to_lo_group_other_dep:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -672,10 +663,9 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
-  %load_lo = load i16, i16 addrspace(3)* %gep_lo
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
-  %load_hi = load i16, i16 addrspace(3)* %gep_hi
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
+  %load_lo = load i16, ptr addrspace(3) %gep_lo
+  %load_hi = load i16, ptr addrspace(3) %ptr
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
   %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
@@ -683,7 +673,7 @@ bb:
 }
 
 ; The volatile operations aren't put on the same chain
-define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -729,17 +719,16 @@ define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
-  %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
-  %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
+  %load_lo = load volatile i16, ptr addrspace(3) %gep_lo
+  %load_hi = load volatile i16, ptr addrspace(3) %ptr
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
   %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -798,17 +787,16 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
-  %load_lo = load i16, i16 addrspace(5)* %gep_lo
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
-  %load_hi = load i16, i16 addrspace(5)* %gep_hi
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
+  %load_lo = load i16, ptr addrspace(5) %gep_lo
+  %load_hi = load i16, ptr addrspace(5) %ptr
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
   %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_global_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -858,17 +846,16 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
-  %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
-  %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1
+  %load_lo = load volatile i16, ptr addrspace(1) %gep_lo
+  %load_hi = load volatile i16, ptr addrspace(1) %ptr
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
   %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
+define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -922,17 +909,16 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
-  %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
-  %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
+  %load_lo = load volatile i16, ptr addrspace(0) %gep_lo
+  %load_hi = load volatile i16, ptr addrspace(0) %ptr
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
   %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
   ret <2 x i16> %result
 }
 
-define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
+define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, ptr addrspace(3) %may.alias) {
 ; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GFX900:       ; %bb.0: ; %bb
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -981,11 +967,10 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i
 ; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
-  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
-  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
-  %load_hi = load i16, i16 addrspace(3)* %gep_hi
-  store i16 123, i16 addrspace(3)* %may.alias
-  %load_lo = load i16, i16 addrspace(3)* %gep_lo
+  %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
+  %load_hi = load i16, ptr addrspace(3) %ptr
+  store i16 123, ptr addrspace(3) %may.alias
+  %load_lo = load i16, ptr addrspace(3) %gep_lo
 
   %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
   %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0

diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index bcb3dc21426a..3b65bdf985fa 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -27,7 +27,7 @@
 
 ; DBG-NOT: Cluster ld/st
 
-define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -116,23 +116,21 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
-  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
-  %ld0 = load i32, i32* %la0
-  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
-  %ld1 = load i32, i32* %la1
-  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
-  %ld2 = load i32, i32* %la2
-  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
-  %ld3 = load i32, i32* %la3
+  %ld0 = load i32, ptr %lb
+  %la1 = getelementptr inbounds i32, ptr %lb, i32 2
+  %ld1 = load i32, ptr %la1
+  %la2 = getelementptr inbounds i32, ptr %lb, i32 4
+  %ld2 = load i32, ptr %la2
+  %la3 = getelementptr inbounds i32, ptr %lb, i32 6
+  %ld3 = load i32, ptr %la3
 
-  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
-  store i32 %ld0, i32* %sa0
-  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
-  store i32 %ld1, i32* %sa1
-  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
-  store i32 %ld2, i32* %sa2
-  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
-  store i32 %ld3, i32* %sa3
+  store i32 %ld0, ptr %sb
+  %sa1 = getelementptr inbounds i32, ptr %sb, i32 2
+  store i32 %ld1, ptr %sa1
+  %sa2 = getelementptr inbounds i32, ptr %sb, i32 4
+  store i32 %ld2, ptr %sa2
+  %sa3 = getelementptr inbounds i32, ptr %sb, i32 6
+  store i32 %ld3, ptr %sa3
 
   ret void
 }
@@ -155,7 +153,7 @@ bb:
 
 ; DBG-NOT: Cluster ld/st
 
-define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) {
 ; GFX9-LABEL: cluster_load_valu_cluster_store:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -248,24 +246,22 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
-  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
-  %ld0 = load i32, i32* %la0
-  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
-  %ld1 = load i32, i32* %la1
-  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
-  %ld2 = load i32, i32* %la2
-  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
-  %ld3 = load i32, i32* %la3
+  %ld0 = load i32, ptr %lb
+  %la1 = getelementptr inbounds i32, ptr %lb, i32 2
+  %ld1 = load i32, ptr %la1
+  %la2 = getelementptr inbounds i32, ptr %lb, i32 4
+  %ld2 = load i32, ptr %la2
+  %la3 = getelementptr inbounds i32, ptr %lb, i32 6
+  %ld3 = load i32, ptr %la3
 
-  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
-  store i32 %ld0, i32* %sa0
-  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+  store i32 %ld0, ptr %sb
+  %sa1 = getelementptr inbounds i32, ptr %sb, i32 2
   %add = add i32 %ld1, 1
-  store i32 %add, i32* %sa1
-  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
-  store i32 %ld2, i32* %sa2
-  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
-  store i32 %ld3, i32* %sa3
+  store i32 %add, ptr %sa1
+  %sa2 = getelementptr inbounds i32, ptr %sb, i32 4
+  store i32 %ld2, ptr %sa2
+  %sa3 = getelementptr inbounds i32, ptr %sb, i32 6
+  store i32 %ld3, ptr %sa3
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 6a8a5b633890..a6279dc86fd5 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -21,7 +21,7 @@ bb0:
   br i1 %tmp9, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %bb2
 
 bb2:
@@ -46,7 +46,7 @@ bb0:
   br i1 %tmp9, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %bb2
 
 bb2:

diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index b76956e280c0..1ee9ed211330 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx902  -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 
-define amdgpu_kernel void @add1(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: add1:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -32,16 +32,16 @@ define amdgpu_kernel void @add1(i32 addrspace(1)* nocapture %arg) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %add = add i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define i16 @add1_i16(i32 addrspace(1)* nocapture %arg, i16 addrspace(1)* nocapture %dst) {
+define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture %dst) {
 ; GCN-LABEL: add1_i16:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -76,8 +76,8 @@ define i16 @add1_i16(i32 addrspace(1)* nocapture %arg, i16 addrspace(1)* nocaptu
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %add = add i32 %v, %ext
@@ -85,7 +85,7 @@ bb:
   ret i16 %trunc
 }
 
-define amdgpu_kernel void @sub1(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: sub1:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -115,16 +115,16 @@ define amdgpu_kernel void @sub1(i32 addrspace(1)* nocapture %arg) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %add = add i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @add_adde(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @add_adde(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: add_adde:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -158,17 +158,17 @@ define amdgpu_kernel void @add_adde(i32 addrspace(1)* nocapture %arg, i32 %a) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %adde = add i32 %v, %ext
   %add2 = add i32 %adde, %a
-  store i32 %add2, i32 addrspace(1)* %gep, align 4
+  store i32 %add2, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @adde_add(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @adde_add(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: adde_add:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -202,17 +202,17 @@ define amdgpu_kernel void @adde_add(i32 addrspace(1)* nocapture %arg, i32 %a) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %add = add i32 %v, %a
   %adde = add i32 %add, %ext
-  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  store i32 %adde, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sub_sube(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @sub_sube(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: sub_sube:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -246,17 +246,17 @@ define amdgpu_kernel void @sub_sube(i32 addrspace(1)* nocapture %arg, i32 %a) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %adde = add i32 %v, %ext
   %sub = sub i32 %adde, %a
-  store i32 %sub, i32 addrspace(1)* %gep, align 4
+  store i32 %sub, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sub_sube_commuted(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @sub_sube_commuted(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: sub_sube_commuted:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -294,18 +294,18 @@ define amdgpu_kernel void @sub_sube_commuted(i32 addrspace(1)* nocapture %arg, i
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %adde = add i32 %v, %ext
   %sub = sub i32 %adde, %a
   %sub2 = sub i32 100, %sub
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
+  store i32 %sub2, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sube_sub(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @sube_sub(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: sube_sub:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -339,17 +339,17 @@ define amdgpu_kernel void @sube_sub(i32 addrspace(1)* nocapture %arg, i32 %a) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %sub = sub i32 %v, %a
   %adde = add i32 %sub, %ext
-  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  store i32 %adde, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @zext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+define amdgpu_kernel void @zext_flclass(ptr addrspace(1) nocapture %arg, float %x) {
 ; GCN-LABEL: zext_flclass:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -382,16 +382,16 @@ define amdgpu_kernel void @zext_flclass(i32 addrspace(1)* nocapture %arg, float
 ; GFX9-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
   %ext = zext i1 %cmp to i32
   %add = add i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+define amdgpu_kernel void @sext_flclass(ptr addrspace(1) nocapture %arg, float %x) {
 ; GCN-LABEL: sext_flclass:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -424,16 +424,16 @@ define amdgpu_kernel void @sext_flclass(i32 addrspace(1)* nocapture %arg, float
 ; GFX9-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
   %ext = sext i1 %cmp to i32
   %add = add i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @add_and(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: add_and:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -467,19 +467,19 @@ define amdgpu_kernel void @add_and(i32 addrspace(1)* nocapture %arg) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp1 = icmp ugt i32 %x, %y
   %cmp2 = icmp ugt i32 %x, 1
   %cmp = and i1 %cmp1, %cmp2
   %ext = zext i1 %cmp to i32
   %add = add i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; sub x, sext (setcc) => addcarry x, 0, setcc
-define amdgpu_kernel void @cmp_sub_sext(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: cmp_sub_sext:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -509,17 +509,17 @@ define amdgpu_kernel void @cmp_sub_sext(i32 addrspace(1)* nocapture %arg) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %add = sub i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; sub x, zext (setcc) => subcarry x, 0, setcc
-define amdgpu_kernel void @cmp_sub_zext(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) {
 ; GCN-LABEL: cmp_sub_zext:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -549,16 +549,16 @@ define amdgpu_kernel void @cmp_sub_zext(i32 addrspace(1)* nocapture %arg) {
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %add = sub i32 %v, %ext
-  store i32 %add, i32 addrspace(1)* %gep, align 4
+  store i32 %add, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sub_addcarry(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @sub_addcarry(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: sub_addcarry:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -592,17 +592,17 @@ define amdgpu_kernel void @sub_addcarry(i32 addrspace(1)* nocapture %arg, i32 %a
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %adde = add i32 %v, %ext
   %add2 = sub i32 %adde, %a
-  store i32 %add2, i32 addrspace(1)* %gep, align 4
+  store i32 %add2, ptr addrspace(1) %gep, align 4
   ret void
 }
 
-define amdgpu_kernel void @sub_subcarry(i32 addrspace(1)* nocapture %arg, i32 %a) {
+define amdgpu_kernel void @sub_subcarry(ptr addrspace(1) nocapture %arg, i32 %a) {
 ; GCN-LABEL: sub_subcarry:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
@@ -636,18 +636,18 @@ define amdgpu_kernel void @sub_subcarry(i32 addrspace(1)* nocapture %arg, i32 %a
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %adde = sub i32 %v, %ext
   %add2 = sub i32 %adde, %a
-  store i32 %add2, i32 addrspace(1)* %gep, align 4
+  store i32 %add2, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; Check case where sub is commuted with zext
-define amdgpu_kernel void @sub_zext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
+define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) {
 ; GCN-LABEL: sub_zext_setcc_commute:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -684,19 +684,19 @@ define amdgpu_kernel void @sub_zext_setcc_commute(i32 addrspace(1)* nocapture %a
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = zext i1 %cmp to i32
   %adde = sub i32 %v, %ext
   %sub = sub i32 %a, %adde
   %sub2 = sub i32 %sub, %b
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
+  store i32 %sub2, ptr addrspace(1) %gep, align 4
   ret void
 }
 
 ; Check case where sub is commuted with sext
-define amdgpu_kernel void @sub_sext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) {
+define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %arg, i32 %a, i32%b) {
 ; GCN-LABEL: sub_sext_setcc_commute:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -733,14 +733,14 @@ define amdgpu_kernel void @sub_sext_setcc_commute(i32 addrspace(1)* nocapture %a
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
-  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %x
+  %v = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %x, %y
   %ext = sext i1 %cmp to i32
   %adde = sub i32 %v, %ext
   %sub = sub i32 %a, %adde
   %sub2 = sub i32 %sub, %b
-  store i32 %sub2, i32 addrspace(1)* %gep, align 4
+  store i32 %sub2, ptr addrspace(1) %gep, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 92682a4b0117..8c0486df1c75 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -9,7 +9,7 @@
 ; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]]
 ; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0
 
-define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
+define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 entry:
   %cmp = icmp eq i32 %s, 3
   br i1 %cmp, label %if.then, label %if.end
@@ -19,11 +19,11 @@ if.then:                                          ; preds = %entry
   %rem4 = urem i16 %rem.lhs.trunc, 12
   %rem.zext = zext i16 %rem4 to i32
   %idxprom = zext i32 %s to i64
-  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %Ad.coerce, i64 %idxprom
   %div = lshr i32 %rem.zext, 3
   %or = or i32 %rem.zext, 192
   %add = add nuw nsw i32 %or, %div
-  store i32 %add, i32 addrspace(1)* %arrayidx3, align 4
+  store i32 %add, ptr addrspace(1) %arrayidx3, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry

diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index a34463446b7c..e2796c316e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @vectorLoadCombine(<4 x i8>* %in, i32* %out) {
+define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadCombine:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -15,7 +15,7 @@ define amdgpu_kernel void @vectorLoadCombine(<4 x i8>* %in, i32* %out) {
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
-  %0 = load <4 x i8>, <4 x i8>* %in, align 4
+  %0 = load <4 x i8>, ptr %in, align 4
   %1 = extractelement <4 x i8> %0, i32 0
   %2 = extractelement <4 x i8> %0, i32 1
   %3 = extractelement <4 x i8> %0, i32 2
@@ -30,11 +30,11 @@ entry:
   %zext3 = zext i8 %4 to i32
   %shift3 = shl nuw i32 %zext3, 24
   %insert3 = or i32 %insert2, %shift3
-  store i32 %insert3, i32* %out
+  store i32 %insert3, ptr %out
   ret void
 }
 
-define amdgpu_kernel void @vectorLoadShuffle(<4 x i8>* %in, i32* %out) {
+define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
 ; GCN-LABEL: vectorLoadShuffle:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -55,7 +55,7 @@ define amdgpu_kernel void @vectorLoadShuffle(<4 x i8>* %in, i32* %out) {
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
-  %0 = load <4 x i8>, <4 x i8>* %in, align 4
+  %0 = load <4 x i8>, ptr %in, align 4
   %1 = extractelement <4 x i8> %0, i32 0
   %2 = extractelement <4 x i8> %0, i32 1
   %3 = extractelement <4 x i8> %0, i32 2
@@ -70,19 +70,19 @@ entry:
   %zext3 = zext i8 %4 to i32
   %shift3 = shl nuw i32 %zext3, 24
   %insert3 = or i32 %insert2, %shift3
-  store i32 %insert3, i32* %out
+  store i32 %insert3, ptr %out
   ret void
 }
-define i32 @load_2xi16_combine(i16 addrspace(1)* %p) #0 {
+define i32 @load_2xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi16_combine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -90,7 +90,7 @@ define i32 @load_2xi16_combine(i16 addrspace(1)* %p) #0 {
   ret i32 %or
 }
 
-define i32 @load_2xi16_noncombine(i16 addrspace(1)* %p) #0 {
+define i32 @load_2xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi16_noncombine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,9 +99,9 @@ define i32 @load_2xi16_noncombine(i16 addrspace(1)* %p) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 2
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 2
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
   %zext.0 = zext i16 %p.0 to i32
   %zext.1 = zext i16 %p.1 to i32
   %shl.1 = shl i32 %zext.1, 16
@@ -109,16 +109,16 @@ define i32 @load_2xi16_noncombine(i16 addrspace(1)* %p) #0 {
   ret i32 %or
 }
 
-define i64 @load_2xi32_combine(i32 addrspace(1)* %p) #0 {
+define i64 @load_2xi32_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi32_combine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i32, i32 addrspace(1)* %p, i32 1
-  %p.0 = load i32, i32 addrspace(1)* %p, align 4
-  %p.1 = load i32, i32 addrspace(1)* %gep.p, align 4
+  %gep.p = getelementptr i32, ptr addrspace(1) %p, i32 1
+  %p.0 = load i32, ptr addrspace(1) %p, align 4
+  %p.1 = load i32, ptr addrspace(1) %gep.p, align 4
   %zext.0 = zext i32 %p.0 to i64
   %zext.1 = zext i32 %p.1 to i64
   %shl.1 = shl i64 %zext.1, 32
@@ -126,7 +126,7 @@ define i64 @load_2xi32_combine(i32 addrspace(1)* %p) #0 {
   ret i64 %or
 }
 
-define i64 @load_2xi32_noncombine(i32 addrspace(1)* %p) #0 {
+define i64 @load_2xi32_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi32_noncombine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -137,9 +137,9 @@ define i64 @load_2xi32_noncombine(i32 addrspace(1)* %p) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i32, i32 addrspace(1)* %p, i32 2
-  %p.0 = load i32, i32 addrspace(1)* %p, align 4
-  %p.1 = load i32, i32 addrspace(1)* %gep.p, align 4
+  %gep.p = getelementptr i32, ptr addrspace(1) %p, i32 2
+  %p.0 = load i32, ptr addrspace(1) %p, align 4
+  %p.1 = load i32, ptr addrspace(1) %gep.p, align 4
   %zext.0 = zext i32 %p.0 to i64
   %zext.1 = zext i32 %p.1 to i64
   %shl.1 = shl i64 %zext.1, 32
@@ -147,20 +147,20 @@ define i64 @load_2xi32_noncombine(i32 addrspace(1)* %p) #0 {
   ret i64 %or
 }
 
-define i64 @load_4xi16_combine(i16 addrspace(1)* %p) #0 {
+define i64 @load_4xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_4xi16_combine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
-  %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
-  %gep.3p = getelementptr i16, i16 addrspace(1)* %p, i32 3
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
-  %p.2 = load i16, i16 addrspace(1)* %gep.2p, align 4
-  %p.3 = load i16, i16 addrspace(1)* %gep.3p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
+  %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
+  %gep.3p = getelementptr i16, ptr addrspace(1) %p, i32 3
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
+  %p.2 = load i16, ptr addrspace(1) %gep.2p, align 4
+  %p.3 = load i16, ptr addrspace(1) %gep.3p, align 4
   %zext.0 = zext i16 %p.0 to i64
   %zext.1 = zext i16 %p.1 to i64
   %zext.2 = zext i16 %p.2 to i64
@@ -175,7 +175,7 @@ define i64 @load_4xi16_combine(i16 addrspace(1)* %p) #0 {
 }
 
 
-define i64 @load_4xi16_noncombine(i16 addrspace(1)* %p) #0 {
+define i64 @load_4xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_4xi16_noncombine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -186,13 +186,13 @@ define i64 @load_4xi16_noncombine(i16 addrspace(1)* %p) #0 {
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v2, v3
 ; GCN-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 3
-  %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
-  %gep.3p = getelementptr i16, i16 addrspace(1)* %p, i32 1
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
-  %p.2 = load i16, i16 addrspace(1)* %gep.2p, align 4
-  %p.3 = load i16, i16 addrspace(1)* %gep.3p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3
+  %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
+  %gep.3p = getelementptr i16, ptr addrspace(1) %p, i32 1
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
+  %p.2 = load i16, ptr addrspace(1) %gep.2p, align 4
+  %p.3 = load i16, ptr addrspace(1) %gep.3p, align 4
   %zext.0 = zext i16 %p.0 to i64
   %zext.1 = zext i16 %p.1 to i64
   %zext.2 = zext i16 %p.2 to i64
@@ -206,7 +206,7 @@ define i64 @load_4xi16_noncombine(i16 addrspace(1)* %p) #0 {
   ret i64 %or.3
 }
 
-define i64 @load_3xi16_combine(i16 addrspace(1)* %p) #0 {
+define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_3xi16_combine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -217,11 +217,11 @@ define i64 @load_3xi16_combine(i16 addrspace(1)* %p) #0 {
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
-  %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
-  %p.2 = load i16, i16 addrspace(1)* %gep.2p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
+  %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
+  %p.2 = load i16, ptr addrspace(1) %gep.2p, align 4
   %zext.0 = zext i16 %p.0 to i64
   %zext.1 = zext i16 %p.1 to i64
   %zext.2 = zext i16 %p.2 to i64
@@ -232,7 +232,7 @@ define i64 @load_3xi16_combine(i16 addrspace(1)* %p) #0 {
   ret i64 %or.2
 }
 
-define i64 @load_3xi16_noncombine(i16 addrspace(1)* %p) #0 {
+define i64 @load_3xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_3xi16_noncombine:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -243,11 +243,11 @@ define i64 @load_3xi16_noncombine(i16 addrspace(1)* %p) #0 {
 ; GCN-NEXT:    v_and_or_b32 v0, v3, s4, v2
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 3
-  %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
-  %p.0 = load i16, i16 addrspace(1)* %p, align 4
-  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 4
-  %p.2 = load i16, i16 addrspace(1)* %gep.2p, align 4
+  %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3
+  %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
+  %p.0 = load i16, ptr addrspace(1) %p, align 4
+  %p.1 = load i16, ptr addrspace(1) %gep.p, align 4
+  %p.2 = load i16, ptr addrspace(1) %gep.2p, align 4
   %zext.0 = zext i16 %p.0 to i64
   %zext.1 = zext i16 %p.1 to i64
   %zext.2 = zext i16 %p.2 to i64

diff  --git a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
index 1545d5219b15..08b41ff766ea 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -12,7 +12,7 @@
 ; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+define amdgpu_kernel void @combine_vloads(ptr addrspace(1) nocapture %src, ptr addrspace(1) nocapture %result) nounwind {
 entry:
   br label %for.body
 
@@ -21,21 +21,18 @@ for.exit:                                         ; preds = %for.body
 
 for.body:                                         ; preds = %for.body, %entry
   %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
-  %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
-  %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
-  %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32
-  %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
-  %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vecload2 = load <8 x i32>, ptr addrspace(1) %src, align 32
+  %0 = bitcast <8 x i32> %vecload2 to <32 x i8>
+  %tmp5 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp8 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
-  %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %tmp12 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
-  %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tmp16 = shufflevector <32 x i8> %0, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
-  %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01
-  %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
-  %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
-  store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+  %scevgep = getelementptr <8 x i8>, ptr addrspace(1) %result, i32 %i.01
+  %1 = bitcast <8 x i8> %tmp17 to <2 x i32>
+  store <2 x i32> %1, ptr addrspace(1) %scevgep, align 8
   %tmp19 = add nsw i32 %i.01, 1
   %exitcond = icmp eq i32 %tmp19, 1024
   br i1 %exitcond, label %for.exit, label %for.body

diff  --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
index df9b4a6510cb..6cb49d7f1f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -8,13 +8,13 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_imm_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %x = load float, ptr addrspace(1) %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %z = fadd float 2.0, %x.fabs
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -22,14 +22,14 @@ define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, fl
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %x = load float, ptr addrspace(1) %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
   %z = fmul float 4.0, %x.fneg.fabs
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,13 +37,13 @@ define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %ou
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %x = load float, ptr addrspace(1) %gep.0
   %x.fneg = fsub float -0.000000e+00, %x
   %z = fmul float 4.0, %x.fneg
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -53,13 +53,13 @@ define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, fl
 ; SI: s_mov_b32 [[K:s[0-9]+]], 0x44800000
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_lit_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %x = load float, float addrspace(1)* %gep.0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %x = load float, ptr addrspace(1) %gep.0
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %z = fadd float 1024.0, %x.fabs
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -68,15 +68,15 @@ define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, fl
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_fabs_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %z = fadd float %x, %y.fabs
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -85,15 +85,15 @@ define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
   %y.fneg = fsub float -0.000000e+00, %y
   %z = fmul float %x, %y.fneg
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -102,16 +102,16 @@ define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
   %z = fmul float %x, %y.fabs.fneg
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -121,16 +121,16 @@ define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, f
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %z = fmul float %x.fabs, %y.fabs
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -139,17 +139,17 @@ define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %ou
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %y.fabs = call float @llvm.fabs.f32(float %y) #1
   %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
   %z = fmul float %x.fabs, %y.fabs.fneg
-  store float %z, float addrspace(1)* %out
+  store float %z, ptr addrspace(1) %out
   ret void
 }
 
@@ -161,19 +161,19 @@ define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)
 ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fma_a_2.0_neg_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %r1 = load volatile float, float addrspace(1)* %gep.0
-  %r2 = load volatile float, float addrspace(1)* %gep.1
+  %r1 = load volatile float, ptr addrspace(1) %gep.0
+  %r2 = load volatile float, ptr addrspace(1) %gep.1
 
   %r2.fabs = call float @llvm.fabs.f32(float %r2)
 
   %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs)
-  store float %r3, float addrspace(1)* %gep.out
+  store float %r3, ptr addrspace(1) %gep.out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll
index 2eeac016c013..1c8b97832e44 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll
@@ -18,9 +18,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  store i32 %ctpop, i32 addrspace(1)* %out, align 4
+  store i32 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -32,12 +32,12 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  store i32 %ctpop, i32 addrspace(1)* %out, align 4
+  store i32 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -54,16 +54,16 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
-  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
-  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
-  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %tid
+  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %tid
+  %val0 = load volatile i32, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load volatile i32, ptr addrspace(1) %in1.gep, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
   %add = add i32 %ctpop0, %ctpop1
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -73,13 +73,13 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
 ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %sval) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %sval
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -90,12 +90,12 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out,
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
-  store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %ctpop, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -110,12 +110,12 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
+  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 %tid
+  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
-  store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %ctpop, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -138,12 +138,12 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
-  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
+  %in.gep = getelementptr <8 x i32>, ptr addrspace(1) %in, i32 %tid
+  %val = load <8 x i32>, ptr addrspace(1) %in.gep, align 32
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
-  store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
+  store <8 x i32> %ctpop, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -182,12 +182,12 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
-  %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
+  %in.gep = getelementptr <16 x i32>, ptr addrspace(1) %in, i32 %tid
+  %val = load <16 x i32>, ptr addrspace(1) %in.gep, align 32
   %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
-  store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
+  store <16 x i32> %ctpop, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -198,13 +198,13 @@ define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out,
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 4
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -215,13 +215,13 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 4, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -232,13 +232,13 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)*
 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 99999
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -250,13 +250,13 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %const) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %const
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -268,13 +268,13 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %const) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -289,15 +289,15 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
-  %const = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %constptr, i32 %tid
+  %const = load i32, ptr addrspace(1) %gep, align 4
   %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -309,7 +309,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %o
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 ; EG: BCNT_INT
-define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
@@ -319,12 +319,12 @@ if:
   br label %endif
 
 else:
-  %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %tmp4 = load i32, ptr addrspace(1) %tmp3
   br label %endif
 
 endif:
   %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
-  store i32 %tmp5, i32 addrspace(1)* %out
+  store i32 %tmp5, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index ff4b78af5718..1b2bca5cdc00 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -11,7 +11,7 @@ declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
-define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind {
 ; SI-LABEL: s_ctpop_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -64,12 +64,12 @@ define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val)
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
-  store i16 %ctpop, i16 addrspace(1)* %out, align 4
+  store i16 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; XXX - Why 0 in register?
-define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -132,14 +132,14 @@ define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrs
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
-  store i16 %ctpop, i16 addrspace(1)* %out, align 4
+  store i16 %ctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind {
 ; SI-LABEL: v_ctpop_add_chain_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -225,18 +225,18 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid
-  %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid
-  %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4
-  %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4
+  %in0.gep = getelementptr i16, ptr addrspace(1) %in0, i32 %tid
+  %in1.gep = getelementptr i16, ptr addrspace(1) %in1, i32 %tid
+  %val0 = load volatile i16, ptr addrspace(1) %in0.gep, align 4
+  %val1 = load volatile i16, ptr addrspace(1) %in1.gep, align 4
   %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
   %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
   %add = add i16 %ctpop0, %ctpop1
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind {
 ; SI-LABEL: v_ctpop_add_sgpr_i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -309,15 +309,15 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out,
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 %ctpop, %sval
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -390,14 +390,14 @@ define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <
 ; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x i16>, ptr addrspace(1) %in.gep, align 8
   %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
-  store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8
+  store <2 x i16> %ctpop, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -514,14 +514,14 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <
 ; EG-NEXT:     MOV T5.X, PV.Y,
 ; EG-NEXT:     MOV * T8.X, T4.X,
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
-  %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16
+  %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
+  %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
   %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
-  store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16
+  store <4 x i16> %ctpop, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_v8i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -694,14 +694,14 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <
 ; EG-NEXT:     MOV * T0.X, T4.X,
 ; EG-NEXT:     MOV * T0.Z, T8.X,
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid
-  %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32
+  %in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
+  %val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
   %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
-  store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32
+  store <8 x i16> %ctpop, ptr addrspace(1) %out, align 32
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_v16i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1010,14 +1010,14 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out,
 ; EG-NEXT:     MOV T20.X, T12.X,
 ; EG-NEXT:     MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid
-  %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32
+  %in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
+  %val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
   %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
-  store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %ctpop, ptr addrspace(1) %out, align 32
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_inline_constant:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1081,15 +1081,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 %ctpop, 4
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1153,15 +1153,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)*
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 4, %ctpop
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_literal:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1227,15 +1227,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %ou
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 %ctpop, 999
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_var:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1308,15 +1308,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 %ctpop, %const
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_var_inv:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1389,15 +1389,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %ou
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
   %add = add i16 %const, %ctpop
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind {
 ; SI-LABEL: v_ctpop_i16_add_vvar_inv:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -1476,19 +1476,19 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %o
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
-  %val = load i16, i16 addrspace(1)* %in.gep, align 4
+  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
+  %val = load i16, ptr addrspace(1) %in.gep, align 4
   %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
-  %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid
-  %const = load i16, i16 addrspace(1)* %gep, align 4
+  %gep = getelementptr i16, ptr addrspace(1) %constptr, i32 %tid
+  %const = load i16, ptr addrspace(1) %gep, align 4
   %add = add i16 %const, %ctpop
-  store i16 %add, i16 addrspace(1)* %out, align 4
+  store i16 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FIXME: We currently disallow SALU instructions in all branches,
 ; but there are some cases when they should be allowed.
-define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) {
+define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) {
 ; SI-LABEL: ctpop_i16_in_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
@@ -1619,12 +1619,12 @@ if:
   br label %endif
 
 else:
-  %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1
-  %tmp4 = load i16, i16 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i16, ptr addrspace(1) %in, i16 1
+  %tmp4 = load i16, ptr addrspace(1) %tmp3
   br label %endif
 
 endif:
   %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else]
-  store i16 %tmp5, i16 addrspace(1)* %out
+  store i16 %tmp5, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 7e188a9f76cb..5ed4c46a1fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -19,10 +19,10 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -33,13 +33,13 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32]
 ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %val = load i64, i64 addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %val = load i64, ptr addrspace(1) %in.gep, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -52,13 +52,13 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
+define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
-  %val = load i64, i64 addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
+  %val = load i64, ptr addrspace(1) %in.gep, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %or = or i64 %ctpop, %s.val
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   ret void
 }
 
@@ -66,10 +66,10 @@ define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
-  store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -79,10 +79,10 @@ define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
-  store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -92,13 +92,13 @@ define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
+  %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
+  %val = load <2 x i64>, ptr addrspace(1) %in.gep, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
-  store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -112,13 +112,13 @@ define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
-  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
+  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
+  %val = load <4 x i64>, ptr addrspace(1) %in.gep, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
-  store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -131,7 +131,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
@@ -141,13 +141,13 @@ if:
   br label %endif
 
 else:
-  %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1
-  %tmp4 = load i64, i64 addrspace(1)* %tmp3
+  %tmp3 = getelementptr i64, ptr addrspace(1) %in, i32 1
+  %tmp4 = load i64, ptr addrspace(1) %tmp3
   br label %endif
 
 endif:
   %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
-  store i64 %tmp5, i64 addrspace(1)* %out
+  store i64 %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -156,10 +156,10 @@ endif:
 ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
 ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -169,10 +169,10 @@ define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %va
 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
   %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
   %truncctpop = trunc i65 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -192,12 +192,12 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val)
 
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
-  %val = load i128, i128 addrspace(1)* %in.gep, align 8
+  %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
+  %val = load i128, ptr addrspace(1) %in.gep, align 8
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
-  store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+  store i32 %truncctpop, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 1ada8947b6f5..102bd99b5085 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -191,7 +191,7 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
-  store i32 %lshr.8, i32 addrspace(1)* undef
+  store i32 %lshr.8, ptr addrspace(1) undef
   %masked = and i32 %lshr.8, 255
   %cvt = uitofp i32 %masked to float
   ret float %cvt
@@ -945,7 +945,7 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
   ret double %cvt
 }
 
-define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_i8_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1013,14 +1013,14 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
-  %load = load i8, i8 addrspace(1)* %gep, align 1
+  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
+  %load = load i8, ptr addrspace(1) %gep, align 1
   %cvt = uitofp i8 %load to float
-  store float %cvt, float addrspace(1)* %out, align 4
+  store float %cvt, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v2i8_to_v2f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1098,14 +1098,14 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
+  %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
-  store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
+  store <2 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v3i8_to_v3f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1188,14 +1188,14 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
+  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
-  store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
+  store <3 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1283,10 +1283,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
@@ -1294,7 +1294,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias
 ; position in the word for the component.
 
 ; FIXME: Packing bytes
-define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1419,16 +1419,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; Instructions still emitted to repack bytes for add use.
-define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
@@ -1612,17 +1612,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
+  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
+  %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
-  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %add, ptr addrspace(1) %out2, align 4
   ret void
 }
 
 ; Make sure this doesn't crash.
-define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v7i8_to_v7f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1798,14 +1798,14 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
+  %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <7 x i8>, ptr addrspace(1) %gep, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
-  store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
+  store <7 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: load_v8i8_to_v8f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1919,14 +1919,14 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
+  %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <8 x i8>, ptr addrspace(1) %gep, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
-  store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
+  store <8 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: i8_zext_inreg_i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2004,16 +2004,16 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %add = add i32 %load, 2
   %inreg = and i32 %add, 255
   %cvt = uitofp i32 %inreg to float
-  store float %cvt, float addrspace(1)* %out, align 4
+  store float %cvt, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2085,18 +2085,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %inreg = and i32 %load, 65280
   %shr = lshr i32 %inreg, 8
   %cvt = uitofp i32 %shr to float
-  store float %cvt, float addrspace(1)* %out, align 4
+  store float %cvt, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; We don't get these ones because of the zext, but instcombine removes
 ; them so it shouldn't really matter.
-define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: i8_zext_i32_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2164,15 +2164,15 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
-  %load = load i8, i8 addrspace(1)* %gep, align 1
+  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
+  %load = load i8, ptr addrspace(1) %gep, align 1
   %ext = zext i8 %load to i32
   %cvt = uitofp i32 %ext to float
-  store float %cvt, float addrspace(1)* %out, align 4
+  store float %cvt, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2297,15 +2297,15 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
   %ext = zext <4 x i8> %load to <4 x i32>
   %cvt = uitofp <4 x i32> %ext to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: extract_byte0_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2377,15 +2377,15 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep
   %and = and i32 %val, 255
   %cvt = uitofp i32 %and to float
-  store float %cvt, float addrspace(1)* %out
+  store float %cvt, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: extract_byte1_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2457,16 +2457,16 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep
   %srl = lshr i32 %val, 8
   %and = and i32 %srl, 255
   %cvt = uitofp i32 %and to float
-  store float %cvt, float addrspace(1)* %out
+  store float %cvt, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: extract_byte2_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2538,16 +2538,16 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep
   %srl = lshr i32 %val, 16
   %and = and i32 %srl, 255
   %cvt = uitofp i32 %and to float
-  store float %cvt, float addrspace(1)* %out
+  store float %cvt, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
 ; SI-LABEL: extract_byte3_to_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2619,16 +2619,16 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
-  %val = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
+  %val = load i32, ptr addrspace(1) %gep
   %srl = lshr i32 %val, 24
   %and = and i32 %srl, 255
   %cvt = uitofp i32 %and to float
-  store float %cvt, float addrspace(1)* %out
+  store float %cvt, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; SI-LABEL: cvt_ubyte0_or_multiuse:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2714,14 +2714,14 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float a
 ; GFX11-NEXT:    s_endpgm
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
-  %load = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
+  %load = load i32, ptr addrspace(1) %gep
   %or = or i32 %load, -2147483647
   %and = and i32 %or, 255
   %uitofp = uitofp i32 %and to float
   %cast = bitcast i32 %or to float
   %add = fadd float %cast, %uitofp
-  store float %add, float addrspace(1)* %out
+  store float %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -2857,15 +2857,14 @@ entry:
   br label %for.body.i
 
 for.body.i:                                       ; preds = %for.body.i, %entry
-  %retval.sroa.0.0.copyload = load %Vec*, %Vec* addrspace(1)* undef, align 8
-  %add.ptr = getelementptr inbounds %Vec, %Vec* %retval.sroa.0.0.copyload, i64 undef
-  %retval.sroa.0.0..sroa_cast = bitcast %Vec* %add.ptr to i32*
-  %retval.sroa.0.0..sroa_cast_adr = addrspacecast i32* %retval.sroa.0.0..sroa_cast to i32 addrspace(1)*
-  %retval.sroa.0.0.copyload.i = load i32, i32 addrspace(1)* %retval.sroa.0.0..sroa_cast_adr, align 1
+  %retval.sroa.0.0.copyload = load ptr, ptr addrspace(1) undef, align 8
+  %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 undef
+  %retval.sroa.0.0..sroa_cast_adr = addrspacecast ptr %add.ptr to ptr addrspace(1)
+  %retval.sroa.0.0.copyload.i = load i32, ptr addrspace(1) %retval.sroa.0.0..sroa_cast_adr, align 1
   %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
   %p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8
   %conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float
-  %0 = load float, float addrspace(1)* undef, align 8
+  %0 = load float, ptr addrspace(1) undef, align 8
   %mul = fmul contract float %0, %conv12
   %add = fadd contract float %mul, 5.000000e-01
   %conv13 = fptoui float %add to i8
@@ -2877,6 +2876,6 @@ for.body.i:                                       ; preds = %for.body.i, %entry
   %retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext
   %retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255
   %retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext
-  store i32 %retval.sroa.0.0.insert.insert, i32 addrspace(1)* undef, align 1
+  store i32 %retval.sroa.0.0.insert.insert, ptr addrspace(1) undef, align 1
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
index c10cf1a8a6f2..36a35cbd1a3b 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -10,10 +10,10 @@ declare float @llvm.floor.f32(float) #1
 ; SI-NOT: add
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_0(ptr addrspace(1) %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -22,11 +22,11 @@ define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_1(ptr addrspace(1) %out, float %x) #0 {
   %fadd = fadd float %x, 1.0
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -35,11 +35,11 @@ define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs(ptr addrspace(1) %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %floor = call float @llvm.floor.f32(float %x.fabs) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,11 +48,11 @@ define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fneg(ptr addrspace(1) %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %floor = call float @llvm.floor.f32(float %x.fneg) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -61,12 +61,12 @@ define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(ptr addrspace(1) %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -75,10 +75,10 @@ define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, flo
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32_e32
 ; SI: s_endpgm
-define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_flr_i32_f32_0(ptr addrspace(1) %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptoui float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index 9b771ebdf7b3..1526f22e60d3 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -9,11 +9,11 @@ declare float @llvm.floor.f32(float) #1
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32(ptr addrspace(1) %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -21,12 +21,12 @@ define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(ptr addrspace(1) %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %fadd = fadd float %x.fabs, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,12 +37,12 @@ define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(ptr addrspace(1) %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %fadd = fadd float %x.fneg, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -55,13 +55,13 @@ define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(ptr addrspace(1) %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %fadd = fadd float %x.fabs.fneg, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 
@@ -71,11 +71,11 @@ define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, flo
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32
 ; SI: s_endpgm
-define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(ptr addrspace(1) %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptoui float %floor to i32
-  store i32 %cvt, i32 addrspace(1)* %out
+  store i32 %cvt, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
index c617eeeac6f4..b876f1ac9706 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -39,15 +39,15 @@ out.else:
   ret i32 %x
 }
 
-define amdgpu_kernel void @uniform_opt_lshr_and_cmp(i1 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %x) {
   ; GCN-LABEL: name: uniform_opt_lshr_and_cmp
   ; GCN: bb.0.entry:
   ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GCN-NEXT:   liveins: $sgpr0_sgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
-  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset.cast, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
   ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def dead $scc
@@ -89,10 +89,10 @@ entry:
 
 out.true:
   %2 = xor i1 %1, -1
-  store i1 %2, i1 addrspace(1)* %out
+  store i1 %2, ptr addrspace(1) %out
   ret void
 
 out.else:
-  store i1 %1, i1 addrspace(1)* %out
+  store i1 %1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index 11acbc274eb5..7e1048db929e 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -9,22 +9,22 @@
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 
-define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_same_base_ptr(ptr addrspace(1) %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #0
   %offset = sext i32 %id to i64
   %offset0 = add i64 %offset, 1027
-  %ptr0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset0
-  store volatile i32 3, i32 addrspace(1)* %ptr0
+  %ptr0 = getelementptr i32, ptr addrspace(1) %out, i64 %offset0
+  store volatile i32 3, ptr addrspace(1) %ptr0
   %offset1 = add i64 %offset, 1026
-  %ptr1 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset1
-  store volatile i32 2, i32 addrspace(1)* %ptr1
+  %ptr1 = getelementptr i32, ptr addrspace(1) %out, i64 %offset1
+  store volatile i32 2, ptr addrspace(1) %ptr1
   %offset2 = add i64 %offset, 1025
-  %ptr2 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset2
-  store volatile i32 1, i32 addrspace(1)* %ptr2
+  %ptr2 = getelementptr i32, ptr addrspace(1) %out, i64 %offset2
+  store volatile i32 1, ptr addrspace(1) %ptr2
   %offset3 = add i64 %offset, 1024
-  %ptr3 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset3
-  store volatile i32 0, i32 addrspace(1)* %ptr3
+  %ptr3 = getelementptr i32, ptr addrspace(1) %out, i64 %offset3
+  store volatile i32 0, ptr addrspace(1) %ptr3
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
index fee9e03c360f..ba53c45356c0 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx908 < %s | FileCheck %s
 
-define void @wombat(i1 %cond, <1 x i8> addrspace(5)* %addr) {
+define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
 ; CHECK-LABEL: wombat:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -20,7 +20,7 @@ define void @wombat(i1 %cond, <1 x i8> addrspace(5)* %addr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %load = load <1 x i8>, <1 x i8> addrspace(5)* %addr, align 1
+  %load = load <1 x i8>, ptr addrspace(5) %addr, align 1
   br i1 %cond, label %then, label %end
 
 then:
@@ -28,6 +28,6 @@ then:
 
 end:
   %phi_value = phi <1 x i8> [%load, %entry], [zeroinitializer, %then]
-  store <1 x i8> %phi_value, <1 x i8> addrspace(5)* %addr, align 1
+  store <1 x i8> %phi_value, ptr addrspace(5) %addr, align 1
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index ceff889b3a7e..28a6a9c6a0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -10,27 +10,27 @@
 ; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sint(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %sint = load i32, i32 addrspace(1) * %in
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %sint = load i32, ptr addrspace(1) %in
   %conv = sitofp i32 %sint to float
   %0 = insertelement <4 x float> undef, float %conv, i32 0
   %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
-  store <4 x float> %splat, <4 x float> addrspace(1)* %out
+  store <4 x float> %splat, ptr addrspace(1) %out
   ret void
 }
 
 ;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @uint(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %uint = load i32, i32 addrspace(1) * %in
+  %ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %uint = load i32, ptr addrspace(1) %in
   %conv = uitofp i32 %uint to float
   %0 = insertelement <4 x float> undef, float %conv, i32 0
   %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
-  store <4 x float> %splat, <4 x float> addrspace(1)* %out
+  store <4 x float> %splat, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll b/llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll
index da4bce27b5c5..f8fda342b121 100644
--- a/llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll
+++ b/llvm/test/CodeGen/AMDGPU/dead-machine-elim-after-dead-lane.ll
@@ -12,7 +12,7 @@ entry:
   ]
 
 sw.bb4:
-  %x = load i64, i64 addrspace(1)* undef, align 8
+  %x = load i64, ptr addrspace(1) undef, align 8
   %c = sitofp i64 %x to float
   %v = insertelement <2 x float> <float undef, float 0.000000e+00>, float %c, i32 0
   br label %foo.exit
@@ -23,6 +23,6 @@ sw.bb10:
 foo.exit:
   %agg = phi <2 x float> [ %v, %sw.bb4 ], [ zeroinitializer, %entry ]
   %s = extractelement <2 x float> %agg, i32 1
-  store float %s, float addrspace(1)* undef, align 4
+  store float %s, ptr addrspace(1) undef, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
index d11abec23f72..0908a24ce178 100644
--- a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
+++ b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
@@ -7,58 +7,58 @@
 ; GCN: GLOBAL_LOAD_DWORDX4_SADDR
 ; GCN: GLOBAL_LOAD_DWORDX4_SADDR
 ; GCN-NEXT: KILL
-define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
+define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
-  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
   %tmp6 = add nuw nsw i64 %tmp2, 1
-  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
-  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
-  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
   %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+  %tmp11 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp10
+  %tmp12 = load <4 x i32>, ptr addrspace(1) %tmp11, align 16
+  %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp10
   %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+  %tmp15 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp14
+  %tmp16 = load <4 x i32>, ptr addrspace(1) %tmp15, align 16
+  %tmp17 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp14
+  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  store <4 x i32> %tmp12, ptr addrspace(1) %tmp13, align 16
+  store <4 x i32> %tmp16, ptr addrspace(1) %tmp17, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}name:{{[ 	]*}}no_vector_clause
 ; GCN-NOT:   BUNDLE
 ; GCN-NOT:   KILL
-define amdgpu_kernel void @no_vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) #0 {
+define amdgpu_kernel void @no_vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp to i64
-  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
-  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
   %tmp6 = add nuw nsw i64 %tmp2, 1
-  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
-  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
-  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
   %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+  %tmp11 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp10
+  %tmp12 = load <4 x i32>, ptr addrspace(1) %tmp11, align 16
+  %tmp13 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp10
   %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
-  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+  %tmp15 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp14
+  %tmp16 = load <4 x i32>, ptr addrspace(1) %tmp15, align 16
+  %tmp17 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp14
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+  store <4 x i32> %tmp12, ptr addrspace(1) %tmp13, align 16
+  store <4 x i32> %tmp16, ptr addrspace(1) %tmp17, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
index 6dfe1294bb47..392665b50841 100644
--- a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
@@ -9,7 +9,7 @@
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP
 ; CHECK-NEXT: LOOP_BREAK
-define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+define amdgpu_kernel void @loop_ge(ptr addrspace(1) nocapture %out, i32 %iterations) nounwind {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
@@ -18,8 +18,8 @@ for.body:                                         ; preds = %for.body, %entry
   %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ]
   %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ]
   %i.07 = add nsw i32 %i.07.in, -1
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06
-  store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %ai.06
+  store i32 %i.07, ptr addrspace(1) %arrayidx, align 4
   %add = add nsw i32 %ai.06, 1
   %exitcond = icmp eq i32 %add, %iterations
   br i1 %exitcond, label %for.end, label %for.body

diff --git a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
index 04c889add9f8..441a36062d5e 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp64_combine.ll
@@ -7,15 +7,15 @@
 ; GCN:           global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
 ; DPP64:         v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 ; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
+define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
-  %load = load i64, i64 addrspace(1)* %gep
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+  %load = load i64, ptr addrspace(1) %gep
   %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i64 %tmp0 to double
   %round = tail call double @llvm.ceil.f64(double %tmp1)
   %tmp2 = bitcast double %round to i64
-  store i64 %tmp2, i64 addrspace(1)* %gep
+  store i64 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 
@@ -23,30 +23,30 @@ define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
 ; GCN:           global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
 ; DPP64:         v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 ; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) {
+define amdgpu_kernel void @dpp64_rcp(ptr addrspace(1) %arg, i64 %in1) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
-  %load = load i64, i64 addrspace(1)* %gep
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+  %load = load i64, ptr addrspace(1) %gep
   %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i64 %tmp0 to double
   %rcp = call double @llvm.amdgcn.rcp.f64(double %tmp1)
   %tmp2 = bitcast double %rcp to i64
-  store i64 %tmp2, i64 addrspace(1)* %gep
+  store i64 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}dpp64_rcp_unsupported_ctl:
 ; GCN-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 ; GCN:         v_rcp_f64_e32
-define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64 %in1) {
+define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(ptr addrspace(1) %arg, i64 %in1) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
-  %load = load i64, i64 addrspace(1)* %gep
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+  %load = load i64, ptr addrspace(1) %gep
   %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i64 %tmp0 to double
   %rcp = fdiv fast double 1.0, %tmp1
   %tmp2 = bitcast double %rcp to i64
-  store i64 %tmp2, i64 addrspace(1)* %gep
+  store i64 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 
@@ -57,15 +57,15 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64
 ; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
 ; GCN:               v_div_scale_f64
 ; GCN:               v_rcp_f64_e32
-define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) {
+define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
-  %load = load i64, i64 addrspace(1)* %gep
+  %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
+  %load = load i64, ptr addrspace(1) %gep
   %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i64 %tmp0 to double
   %rcp = fdiv double 15.0, %tmp1
   %tmp2 = bitcast double %rcp to i64
-  store i64 %tmp2, i64 addrspace(1)* %gep
+  store i64 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index d127f7342421..4d979e2cf749 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -5,44 +5,44 @@
 ; GCN-LABEL: {{^}}dpp_add:
 ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
 ; GCN: v_add_{{(nc_)?}}u32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp_add(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @dpp_add(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %load = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %load = load i32, ptr addrspace(1) %gep
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
   %add = add i32 %tmp0, %load
-  store i32 %add, i32 addrspace(1)* %gep
+  store i32 %add, ptr addrspace(1) %gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}dpp_ceil:
 ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
 ; GCN: v_ceil_f32_dpp [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp_ceil(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @dpp_ceil(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %load = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %load = load i32, ptr addrspace(1) %gep
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i32 %tmp0 to float
   %round = tail call float @llvm.ceil.f32(float %tmp1)
   %tmp2 = bitcast float %round to i32
-  store i32 %tmp2, i32 addrspace(1)* %gep
+  store i32 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 
 ; GCN-LABEL: {{^}}dpp_fadd:
 ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
 ; GCN: v_add_f32_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
-define amdgpu_kernel void @dpp_fadd(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @dpp_fadd(ptr addrspace(1) %arg) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
-  %load = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+  %load = load i32, ptr addrspace(1) %gep
   %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
   %tmp1 = bitcast i32 %tmp0 to float
   %t = bitcast i32 %load to float
   %add = fadd float %tmp1, %t
   %tmp2 = bitcast float %add to i32
-  store i32 %tmp2, i32 addrspace(1)* %gep
+  store i32 %tmp2, ptr addrspace(1) %gep
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 59657a479398..b17a4c3c34ec 100644
--- a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -9,29 +9,29 @@
 ; GCN: buffer_load_dword
 ; GCN: ds_write2_b32
 ; GCN: s_endpgm
-define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
+define amdgpu_kernel void @reschedule_global_load_lds_store(ptr addrspace(1) noalias %gptr0, ptr addrspace(1) noalias %gptr1, ptr addrspace(3) noalias %lptr, i32 %c) #0 {
 entry:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx = shl i32 %tid, 2
-  %gep0 = getelementptr i32, i32 addrspace(1)* %gptr0, i32 %idx
-  %gep1 = getelementptr i32, i32 addrspace(1)* %gptr1, i32 %idx
-  %gep2 = getelementptr i32, i32 addrspace(3)* %lptr, i32 %tid
+  %gep0 = getelementptr i32, ptr addrspace(1) %gptr0, i32 %idx
+  %gep1 = getelementptr i32, ptr addrspace(1) %gptr1, i32 %idx
+  %gep2 = getelementptr i32, ptr addrspace(3) %lptr, i32 %tid
   %cmp0 = icmp eq i32 %c, 0
   br i1 %cmp0, label %for.body, label %exit
 
 for.body:                                         ; preds = %for.body, %entry
   %i = phi i32 [ 0, %entry ], [ %i.inc, %for.body ]
-  %gptr0.phi = phi i32 addrspace(1)* [ %gep0, %entry ], [ %gep0.inc, %for.body ]
-  %gptr1.phi = phi i32 addrspace(1)* [ %gep1, %entry ], [ %gep1.inc, %for.body ]
-  %lptr0.phi = phi i32 addrspace(3)* [ %gep2, %entry ], [ %gep2.inc, %for.body ]
-  %lptr1 = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 2
-  %val0 = load i32, i32 addrspace(1)* %gep0
-  store i32 %val0, i32 addrspace(3)* %lptr0.phi
-  %val1 = load i32, i32 addrspace(1)* %gep1
-  store i32 %val1, i32 addrspace(3)* %lptr1
-  %gep0.inc = getelementptr i32, i32 addrspace(1)* %gptr0.phi, i32 4
-  %gep1.inc = getelementptr i32, i32 addrspace(1)* %gptr1.phi, i32 4
-  %gep2.inc = getelementptr i32, i32 addrspace(3)* %lptr0.phi, i32 4
+  %gptr0.phi = phi ptr addrspace(1) [ %gep0, %entry ], [ %gep0.inc, %for.body ]
+  %gptr1.phi = phi ptr addrspace(1) [ %gep1, %entry ], [ %gep1.inc, %for.body ]
+  %lptr0.phi = phi ptr addrspace(3) [ %gep2, %entry ], [ %gep2.inc, %for.body ]
+  %lptr1 = getelementptr i32, ptr addrspace(3) %lptr0.phi, i32 2
+  %val0 = load i32, ptr addrspace(1) %gep0
+  store i32 %val0, ptr addrspace(3) %lptr0.phi
+  %val1 = load i32, ptr addrspace(1) %gep1
+  store i32 %val1, ptr addrspace(3) %lptr1
+  %gep0.inc = getelementptr i32, ptr addrspace(1) %gptr0.phi, i32 4
+  %gep1.inc = getelementptr i32, ptr addrspace(1) %gptr1.phi, i32 4
+  %gep2.inc = getelementptr i32, ptr addrspace(3) %lptr0.phi, i32 4
   %i.inc = add nsw i32 %i, 1
   %cmp1 = icmp ne i32 %i, 256
   br i1 %cmp1, label %for.body, label %exit

diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index a05b76440878..c60da23ebb4b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG
 ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
 
-define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds1align1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -15,12 +15,12 @@ define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b8 v1, v0
 ; GCN-NEXT:    s_endpgm
-  %val = load i8, i8 addrspace(3)* %in, align 1
-  store i8 %val, i8 addrspace(3)* %out, align 1
+  %val = load i8, ptr addrspace(3) %in, align 1
+  store i8 %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds2align1:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -60,12 +60,12 @@ define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %o
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b16 v1, v0
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load i16, i16 addrspace(3)* %in, align 1
-  store i16 %val, i16 addrspace(3)* %out, align 1
+  %val = load i16, ptr addrspace(3) %in, align 1
+  store i16 %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds2align2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -76,12 +76,12 @@ define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %o
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b16 v1, v0
 ; GCN-NEXT:    s_endpgm
-  %val = load i16, i16 addrspace(3)* %in, align 2
-  store i16 %val, i16 addrspace(3)* %out, align 2
+  %val = load i16, ptr addrspace(3) %in, align 2
+  store i16 %val, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds4align1:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -138,12 +138,12 @@ define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %o
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b32 v1, v0
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load i32, i32 addrspace(3)* %in, align 1
-  store i32 %val, i32 addrspace(3)* %out, align 1
+  %val = load i32, ptr addrspace(3) %in, align 1
+  store i32 %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds4align2:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -182,12 +182,12 @@ define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %o
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b32 v1, v0
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load i32, i32 addrspace(3)* %in, align 2
-  store i32 %val, i32 addrspace(3)* %out, align 2
+  %val = load i32, ptr addrspace(3) %in, align 2
+  store i32 %val, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds4align4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -198,12 +198,12 @@ define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %o
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b32 v1, v0
 ; GCN-NEXT:    s_endpgm
-  %val = load i32, i32 addrspace(3)* %in, align 4
-  store i32 %val, i32 addrspace(3)* %out, align 4
+  %val = load i32, ptr addrspace(3) %in, align 4
+  store i32 %val, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds8align1:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -283,12 +283,12 @@ define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
-  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
+  %val = load <2 x i32>, ptr addrspace(3) %in, align 1
+  store <2 x i32> %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds8align2:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -339,12 +339,12 @@ define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
-  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
+  %val = load <2 x i32>, ptr addrspace(3) %in, align 2
+  store <2 x i32> %val, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds8align4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -355,12 +355,12 @@ define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
 ; GCN-NEXT:    s_endpgm
-  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
-  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
+  %val = load <2 x i32>, ptr addrspace(3) %in, align 4
+  store <2 x i32> %val, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds8align8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -371,12 +371,12 @@ define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addr
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b64 v2, v[0:1]
 ; GCN-NEXT:    s_endpgm
-  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
-  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
+  %val = load <2 x i32>, ptr addrspace(3) %in, align 8
+  store <2 x i32> %val, ptr addrspace(3) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds12align1:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -483,12 +483,12 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
-  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
+  %val = load <3 x i32>, ptr addrspace(3) %in, align 1
+  store <3 x i32> %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds12align2:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -549,12 +549,12 @@ define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
-  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
+  %val = load <3 x i32>, ptr addrspace(3) %in, align 2
+  store <3 x i32> %val, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-LABEL: ds12align4:
 ; ALIGNED:       ; %bb.0:
 ; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -593,12 +593,12 @@ define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-GISEL-NEXT:    ds_write_b96 v3, v[0:2]
 ; UNALIGNED-GISEL-NEXT:    s_endpgm
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
-  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
+  %val = load <3 x i32>, ptr addrspace(3) %in, align 4
+  store <3 x i32> %val, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds12align8:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -651,12 +651,12 @@ define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-GISEL-NEXT:    ds_write_b96 v3, v[0:2]
 ; UNALIGNED-GISEL-NEXT:    s_endpgm
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
-  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
+  %val = load <3 x i32>, ptr addrspace(3) %in, align 8
+  store <3 x i32> %val, ptr addrspace(3) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds12align16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -667,12 +667,12 @@ define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> ad
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b96 v3, v[0:2]
 ; GCN-NEXT:    s_endpgm
-  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
-  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
+  %val = load <3 x i32>, ptr addrspace(3) %in, align 16
+  store <3 x i32> %val, ptr addrspace(3) %out, align 16
   ret void
 }
 
-define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds16align1:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -805,12 +805,12 @@ define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
-  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
+  %val = load <4 x i32>, ptr addrspace(3) %in, align 1
+  store <4 x i32> %val, ptr addrspace(3) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-SDAG-LABEL: ds16align2:
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -881,12 +881,12 @@ define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
 ; UNALIGNED-NEXT:    s_endpgm
-  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
-  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
+  %val = load <4 x i32>, ptr addrspace(3) %in, align 2
+  store <4 x i32> %val, ptr addrspace(3) %out, align 2
   ret void
 }
 
-define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; ALIGNED-LABEL: ds16align4:
 ; ALIGNED:       ; %bb.0:
 ; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -925,12 +925,12 @@ define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; UNALIGNED-GISEL-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
 ; UNALIGNED-GISEL-NEXT:    s_endpgm
-  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
-  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
+  %val = load <4 x i32>, ptr addrspace(3) %in, align 4
+  store <4 x i32> %val, ptr addrspace(3) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds16align8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -941,12 +941,12 @@ define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
 ; GCN-NEXT:    s_endpgm
-  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
-  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
+  %val = load <4 x i32>, ptr addrspace(3) %in, align 8
+  store <4 x i32> %val, ptr addrspace(3) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
+define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
 ; GCN-LABEL: ds16align16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -957,7 +957,7 @@ define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> ad
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b128 v4, v[0:3]
 ; GCN-NEXT:    s_endpgm
-  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
-  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
+  %val = load <4 x i32>, ptr addrspace(3) %in, align 16
+  store <4 x i32> %val, ptr addrspace(3) %out, align 16
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
index 131b8e3aa2d9..aa1d44c31606 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll
@@ -17,32 +17,32 @@
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:72 offset1:172
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:88 offset1:188
-define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+define amdgpu_kernel void @ds_read32_combine_stride_400(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = load float, float addrspace(3)* %arg, align 4
+  %tmp = load float, ptr addrspace(3) %arg, align 4
   %tmp2 = fadd float %tmp, 0.000000e+00
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 100
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
   %tmp5 = fadd float %tmp2, %tmp4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
-  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 200
+  %tmp7 = load float, ptr addrspace(3) %tmp6, align 4
   %tmp8 = fadd float %tmp5, %tmp7
-  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
-  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 300
+  %tmp10 = load float, ptr addrspace(3) %tmp9, align 4
   %tmp11 = fadd float %tmp8, %tmp10
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 400
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = fadd float %tmp11, %tmp13
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 500
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
   %tmp17 = fadd float %tmp14, %tmp16
-  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
-  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 600
+  %tmp19 = load float, ptr addrspace(3) %tmp18, align 4
   %tmp20 = fadd float %tmp17, %tmp19
-  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
-  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
+  %tmp21 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 700
+  %tmp22 = load float, ptr addrspace(3) %tmp21, align 4
   %tmp23 = fadd float %tmp20, %tmp22
-  store float %tmp23, float *%arg1, align 4
+  store float %tmp23, ptr %arg1, align 4
   ret void
 }
 
@@ -60,33 +60,33 @@ bb:
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:184 offset1:204
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:224 offset1:244
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:8 offset1:28
-define amdgpu_kernel void @ds_read32_combine_stride_20(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+define amdgpu_kernel void @ds_read32_combine_stride_20(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
-  %tmp1 = load float, float addrspace(3)* %tmp, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 400
+  %tmp1 = load float, ptr addrspace(3) %tmp, align 4
   %tmp2 = fadd float %tmp1, 0.000000e+00
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 420
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 420
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
   %tmp5 = fadd float %tmp2, %tmp4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 440
-  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 440
+  %tmp7 = load float, ptr addrspace(3) %tmp6, align 4
   %tmp8 = fadd float %tmp5, %tmp7
-  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 460
-  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 460
+  %tmp10 = load float, ptr addrspace(3) %tmp9, align 4
   %tmp11 = fadd float %tmp8, %tmp10
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 480
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 480
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = fadd float %tmp11, %tmp13
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 500
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
   %tmp17 = fadd float %tmp14, %tmp16
-  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 520
-  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 520
+  %tmp19 = load float, ptr addrspace(3) %tmp18, align 4
   %tmp20 = fadd float %tmp17, %tmp19
-  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 540
-  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
+  %tmp21 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 540
+  %tmp22 = load float, ptr addrspace(3) %tmp21, align 4
   %tmp23 = fadd float %tmp20, %tmp22
-  store float %tmp23, float *%arg1, align 4
+  store float %tmp23, ptr %arg1, align 4
   ret void
 }
 
@@ -106,32 +106,32 @@ bb:
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:88 offset1:188
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244
 ; GCN-DAG: ds_read2_b32  v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:72 offset1:172
-define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+define amdgpu_kernel void @ds_read32_combine_stride_400_back(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
-  %tmp2 = load float, float addrspace(3)* %tmp, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 700
+  %tmp2 = load float, ptr addrspace(3) %tmp, align 4
   %tmp3 = fadd float %tmp2, 0.000000e+00
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
-  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 600
+  %tmp5 = load float, ptr addrspace(3) %tmp4, align 4
   %tmp6 = fadd float %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
-  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 500
+  %tmp8 = load float, ptr addrspace(3) %tmp7, align 4
   %tmp9 = fadd float %tmp6, %tmp8
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 400
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
   %tmp12 = fadd float %tmp9, %tmp11
-  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
-  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
+  %tmp13 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 300
+  %tmp14 = load float, ptr addrspace(3) %tmp13, align 4
   %tmp15 = fadd float %tmp12, %tmp14
-  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
-  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
+  %tmp16 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 200
+  %tmp17 = load float, ptr addrspace(3) %tmp16, align 4
   %tmp18 = fadd float %tmp15, %tmp17
-  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
-  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+  %tmp19 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 100
+  %tmp20 = load float, ptr addrspace(3) %tmp19, align 4
   %tmp21 = fadd float %tmp18, %tmp20
-  %tmp22 = load float, float addrspace(3)* %arg, align 4
+  %tmp22 = load float, ptr addrspace(3) %arg, align 4
   %tmp23 = fadd float %tmp21, %tmp22
-  store float %tmp23, float *%arg1, align 4
+  store float %tmp23, ptr %arg1, align 4
   ret void
 }
 
@@ -142,32 +142,32 @@ bb:
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:64 offset1:96
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:128 offset1:160
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:192 offset1:224
-define amdgpu_kernel void @ds_read32_combine_stride_8192(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+define amdgpu_kernel void @ds_read32_combine_stride_8192(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = load float, float addrspace(3)* %arg, align 4
+  %tmp = load float, ptr addrspace(3) %arg, align 4
   %tmp2 = fadd float %tmp, 0.000000e+00
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
-  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2048
+  %tmp4 = load float, ptr addrspace(3) %tmp3, align 4
   %tmp5 = fadd float %tmp2, %tmp4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
-  %tmp7 = load float, float addrspace(3)* %tmp6, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 4096
+  %tmp7 = load float, ptr addrspace(3) %tmp6, align 4
   %tmp8 = fadd float %tmp5, %tmp7
-  %tmp9 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
-  %tmp10 = load float, float addrspace(3)* %tmp9, align 4
+  %tmp9 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6144
+  %tmp10 = load float, ptr addrspace(3) %tmp9, align 4
   %tmp11 = fadd float %tmp8, %tmp10
-  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
-  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp12 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 8192
+  %tmp13 = load float, ptr addrspace(3) %tmp12, align 4
   %tmp14 = fadd float %tmp11, %tmp13
-  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
-  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+  %tmp15 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10240
+  %tmp16 = load float, ptr addrspace(3) %tmp15, align 4
   %tmp17 = fadd float %tmp14, %tmp16
-  %tmp18 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
-  %tmp19 = load float, float addrspace(3)* %tmp18, align 4
+  %tmp18 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 12288
+  %tmp19 = load float, ptr addrspace(3) %tmp18, align 4
   %tmp20 = fadd float %tmp17, %tmp19
-  %tmp21 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
-  %tmp22 = load float, float addrspace(3)* %tmp21, align 4
+  %tmp21 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14336
+  %tmp22 = load float, ptr addrspace(3) %tmp21, align 4
   %tmp23 = fadd float %tmp20, %tmp22
-  store float %tmp23, float *%arg1, align 4
+  store float %tmp23, ptr %arg1, align 4
   ret void
 }
 
@@ -181,27 +181,27 @@ bb:
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:96
 ; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:128 offset1:160
-define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
+define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
-  %tmp2 = load float, float addrspace(3)* %tmp, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2
+  %tmp2 = load float, ptr addrspace(3) %tmp, align 4
   %tmp3 = fadd float %tmp2, 0.000000e+00
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2050
-  %tmp5 = load float, float addrspace(3)* %tmp4, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2050
+  %tmp5 = load float, ptr addrspace(3) %tmp4, align 4
   %tmp6 = fadd float %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4098
-  %tmp8 = load float, float addrspace(3)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 4098
+  %tmp8 = load float, ptr addrspace(3) %tmp7, align 4
   %tmp9 = fadd float %tmp6, %tmp8
-  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6146
-  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+  %tmp10 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6146
+  %tmp11 = load float, ptr addrspace(3) %tmp10, align 4
   %tmp12 = fadd float %tmp9, %tmp11
-  %tmp13 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8194
-  %tmp14 = load float, float addrspace(3)* %tmp13, align 4
+  %tmp13 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 8194
+  %tmp14 = load float, ptr addrspace(3) %tmp13, align 4
   %tmp15 = fadd float %tmp12, %tmp14
-  %tmp16 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10242
-  %tmp17 = load float, float addrspace(3)* %tmp16, align 4
+  %tmp16 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10242
+  %tmp17 = load float, ptr addrspace(3) %tmp16, align 4
   %tmp18 = fadd float %tmp15, %tmp17
-  store float %tmp18, float *%arg1, align 4
+  store float %tmp18, ptr %arg1, align 4
   ret void
 }
 
@@ -216,32 +216,32 @@ bb:
 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
 ; GCN-DAG: ds_read2_b64  v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:44 offset1:94
-define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
+define amdgpu_kernel void @ds_read64_combine_stride_400(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = load double, double addrspace(3)* %arg, align 8
+  %tmp = load double, ptr addrspace(3) %arg, align 8
   %tmp2 = fadd double %tmp, 0.000000e+00
-  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
-  %tmp4 = load double, double addrspace(3)* %tmp3, align 8
+  %tmp3 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 50
+  %tmp4 = load double, ptr addrspace(3) %tmp3, align 8
   %tmp5 = fadd double %tmp2, %tmp4
-  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
-  %tmp7 = load double, double addrspace(3)* %tmp6, align 8
+  %tmp6 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 100
+  %tmp7 = load double, ptr addrspace(3) %tmp6, align 8
   %tmp8 = fadd double %tmp5, %tmp7
-  %tmp9 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
-  %tmp10 = load double, double addrspace(3)* %tmp9, align 8
+  %tmp9 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 150
+  %tmp10 = load double, ptr addrspace(3) %tmp9, align 8
   %tmp11 = fadd double %tmp8, %tmp10
-  %tmp12 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
-  %tmp13 = load double, double addrspace(3)* %tmp12, align 8
+  %tmp12 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 200
+  %tmp13 = load double, ptr addrspace(3) %tmp12, align 8
   %tmp14 = fadd double %tmp11, %tmp13
-  %tmp15 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
-  %tmp16 = load double, double addrspace(3)* %tmp15, align 8
+  %tmp15 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 250
+  %tmp16 = load double, ptr addrspace(3) %tmp15, align 8
   %tmp17 = fadd double %tmp14, %tmp16
-  %tmp18 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
-  %tmp19 = load double, double addrspace(3)* %tmp18, align 8
+  %tmp18 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 300
+  %tmp19 = load double, ptr addrspace(3) %tmp18, align 8
   %tmp20 = fadd double %tmp17, %tmp19
-  %tmp21 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
-  %tmp22 = load double, double addrspace(3)* %tmp21, align 8
+  %tmp21 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 350
+  %tmp22 = load double, ptr addrspace(3) %tmp21, align 8
   %tmp23 = fadd double %tmp20, %tmp22
-  store double %tmp23, double *%arg1, align 8
+  store double %tmp23, ptr %arg1, align 8
   ret void
 }
 
@@ -255,27 +255,27 @@ bb:
 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:32 offset1:48
 ; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:80
-define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
+define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(ptr addrspace(3) nocapture readonly %arg, ptr nocapture %arg1) {
 bb:
-  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
-  %tmp2 = load double, double addrspace(3)* %tmp, align 8
+  %tmp = getelementptr inbounds double, ptr addrspace(3) %arg, i32 1
+  %tmp2 = load double, ptr addrspace(3) %tmp, align 8
   %tmp3 = fadd double %tmp2, 0.000000e+00
-  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
-  %tmp5 = load double, double addrspace(3)* %tmp4, align 8
+  %tmp4 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 1025
+  %tmp5 = load double, ptr addrspace(3) %tmp4, align 8
   %tmp6 = fadd double %tmp3, %tmp5
-  %tmp7 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
-  %tmp8 = load double, double addrspace(3)* %tmp7, align 8
+  %tmp7 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 2049
+  %tmp8 = load double, ptr addrspace(3) %tmp7, align 8
   %tmp9 = fadd double %tmp6, %tmp8
-  %tmp10 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
-  %tmp11 = load double, double addrspace(3)* %tmp10, align 8
+  %tmp10 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 3073
+  %tmp11 = load double, ptr addrspace(3) %tmp10, align 8
   %tmp12 = fadd double %tmp9, %tmp11
-  %tmp13 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
-  %tmp14 = load double, double addrspace(3)* %tmp13, align 8
+  %tmp13 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 4097
+  %tmp14 = load double, ptr addrspace(3) %tmp13, align 8
   %tmp15 = fadd double %tmp12, %tmp14
-  %tmp16 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
-  %tmp17 = load double, double addrspace(3)* %tmp16, align 8
+  %tmp16 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 5121
+  %tmp17 = load double, ptr addrspace(3) %tmp16, align 8
   %tmp18 = fadd double %tmp15, %tmp17
-  store double %tmp18, double *%arg1, align 8
+  store double %tmp18, ptr %arg1, align 8
   ret void
 }
 
@@ -295,23 +295,23 @@ bb:
 ; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172
 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244
 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:88 offset1:188
-define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write32_combine_stride_400(ptr addrspace(3) nocapture %arg) {
 bb:
-  store float 1.000000e+00, float addrspace(3)* %arg, align 4
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
-  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
-  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
-  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
-  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
-  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
-  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
-  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
-  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
-  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
+  store float 1.000000e+00, ptr addrspace(3) %arg, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 100
+  store float 1.000000e+00, ptr addrspace(3) %tmp, align 4
+  %tmp1 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 200
+  store float 1.000000e+00, ptr addrspace(3) %tmp1, align 4
+  %tmp2 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 300
+  store float 1.000000e+00, ptr addrspace(3) %tmp2, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 400
+  store float 1.000000e+00, ptr addrspace(3) %tmp3, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 500
+  store float 1.000000e+00, ptr addrspace(3) %tmp4, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 600
+  store float 1.000000e+00, ptr addrspace(3) %tmp5, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 700
+  store float 1.000000e+00, ptr addrspace(3) %tmp6, align 4
   ret void
 }
 
@@ -331,23 +331,23 @@ bb:
 ; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244
 ; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172
 ; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
-define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write32_combine_stride_400_back(ptr addrspace(3) nocapture %arg) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
-  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
-  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 600
-  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
-  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 500
-  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
-  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 300
-  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 200
-  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 100
-  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
-  store float 1.000000e+00, float addrspace(3)* %arg, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 700
+  store float 1.000000e+00, ptr addrspace(3) %tmp, align 4
+  %tmp1 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 600
+  store float 1.000000e+00, ptr addrspace(3) %tmp1, align 4
+  %tmp2 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 500
+  store float 1.000000e+00, ptr addrspace(3) %tmp2, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 400
+  store float 1.000000e+00, ptr addrspace(3) %tmp3, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 300
+  store float 1.000000e+00, ptr addrspace(3) %tmp4, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 200
+  store float 1.000000e+00, ptr addrspace(3) %tmp5, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 100
+  store float 1.000000e+00, ptr addrspace(3) %tmp6, align 4
+  store float 1.000000e+00, ptr addrspace(3) %arg, align 4
   ret void
 }
 
@@ -358,23 +358,23 @@ bb:
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:192 offset1:224
-define amdgpu_kernel void @ds_write32_combine_stride_8192(float addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write32_combine_stride_8192(ptr addrspace(3) nocapture %arg) {
 bb:
-  store float 1.000000e+00, float addrspace(3)* %arg, align 4
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2048
-  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
-  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4096
-  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
-  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6144
-  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8192
-  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10240
-  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 12288
-  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
-  %tmp6 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14336
-  store float 1.000000e+00, float addrspace(3)* %tmp6, align 4
+  store float 1.000000e+00, ptr addrspace(3) %arg, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2048
+  store float 1.000000e+00, ptr addrspace(3) %tmp, align 4
+  %tmp1 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 4096
+  store float 1.000000e+00, ptr addrspace(3) %tmp1, align 4
+  %tmp2 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6144
+  store float 1.000000e+00, ptr addrspace(3) %tmp2, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 8192
+  store float 1.000000e+00, ptr addrspace(3) %tmp3, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10240
+  store float 1.000000e+00, ptr addrspace(3) %tmp4, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 12288
+  store float 1.000000e+00, ptr addrspace(3) %tmp5, align 4
+  %tmp6 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 14336
+  store float 1.000000e+00, ptr addrspace(3) %tmp6, align 4
   ret void
 }
 
@@ -388,20 +388,20 @@ bb:
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
 ; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
-define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(ptr addrspace(3) nocapture %arg) {
 bb:
-  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
-  store float 1.000000e+00, float addrspace(3)* %tmp, align 4
-  %tmp1 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2049
-  store float 1.000000e+00, float addrspace(3)* %tmp1, align 4
-  %tmp2 = getelementptr inbounds float, float addrspace(3)* %arg, i32 4097
-  store float 1.000000e+00, float addrspace(3)* %tmp2, align 4
-  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6145
-  store float 1.000000e+00, float addrspace(3)* %tmp3, align 4
-  %tmp4 = getelementptr inbounds float, float addrspace(3)* %arg, i32 8193
-  store float 1.000000e+00, float addrspace(3)* %tmp4, align 4
-  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10241
-  store float 1.000000e+00, float addrspace(3)* %tmp5, align 4
+  %tmp = getelementptr inbounds float, ptr addrspace(3) %arg, i32 1
+  store float 1.000000e+00, ptr addrspace(3) %tmp, align 4
+  %tmp1 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 2049
+  store float 1.000000e+00, ptr addrspace(3) %tmp1, align 4
+  %tmp2 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 4097
+  store float 1.000000e+00, ptr addrspace(3) %tmp2, align 4
+  %tmp3 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 6145
+  store float 1.000000e+00, ptr addrspace(3) %tmp3, align 4
+  %tmp4 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 8193
+  store float 1.000000e+00, ptr addrspace(3) %tmp4, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(3) %arg, i32 10241
+  store float 1.000000e+00, ptr addrspace(3) %tmp5, align 4
   ret void
 }
 
@@ -416,23 +416,23 @@ bb:
 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
 ; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
 ; GCN-DAG: ds_write2_b64 [[B1]],   v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:44 offset1:94
-define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write64_combine_stride_400(ptr addrspace(3) nocapture %arg) {
 bb:
-  store double 1.000000e+00, double addrspace(3)* %arg, align 8
-  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 50
-  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
-  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 100
-  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
-  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 150
-  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
-  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 200
-  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
-  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 250
-  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
-  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 300
-  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
-  %tmp6 = getelementptr inbounds double, double addrspace(3)* %arg, i32 350
-  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
+  store double 1.000000e+00, ptr addrspace(3) %arg, align 8
+  %tmp = getelementptr inbounds double, ptr addrspace(3) %arg, i32 50
+  store double 1.000000e+00, ptr addrspace(3) %tmp, align 8
+  %tmp1 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 100
+  store double 1.000000e+00, ptr addrspace(3) %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 150
+  store double 1.000000e+00, ptr addrspace(3) %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 200
+  store double 1.000000e+00, ptr addrspace(3) %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 250
+  store double 1.000000e+00, ptr addrspace(3) %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 300
+  store double 1.000000e+00, ptr addrspace(3) %tmp5, align 8
+  %tmp6 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 350
+  store double 1.000000e+00, ptr addrspace(3) %tmp6, align 8
   ret void
 }
 
@@ -446,19 +446,19 @@ bb:
 ; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
 ; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:32 offset1:48
 ; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:64 offset1:80
-define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
+define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(ptr addrspace(3) nocapture %arg) {
 bb:
-  %tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
-  store double 1.000000e+00, double addrspace(3)* %tmp, align 8
-  %tmp1 = getelementptr inbounds double, double addrspace(3)* %arg, i32 1025
-  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
-  %tmp2 = getelementptr inbounds double, double addrspace(3)* %arg, i32 2049
-  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
-  %tmp3 = getelementptr inbounds double, double addrspace(3)* %arg, i32 3073
-  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
-  %tmp4 = getelementptr inbounds double, double addrspace(3)* %arg, i32 4097
-  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
-  %tmp5 = getelementptr inbounds double, double addrspace(3)* %arg, i32 5121
-  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+  %tmp = getelementptr inbounds double, ptr addrspace(3) %arg, i32 1
+  store double 1.000000e+00, ptr addrspace(3) %tmp, align 8
+  %tmp1 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 1025
+  store double 1.000000e+00, ptr addrspace(3) %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 2049
+  store double 1.000000e+00, ptr addrspace(3) %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 3073
+  store double 1.000000e+00, ptr addrspace(3) %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 4097
+  store double 1.000000e+00, ptr addrspace(3) %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, ptr addrspace(3) %arg, i32 5121
+  store double 1.000000e+00, ptr addrspace(3) %tmp5, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
index 0f4b2778b591..201765a236bd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -9,27 +9,23 @@
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
 ; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
-define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
 
-  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
-  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
-  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
-  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
-  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24
+  %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
   %v0 = extractelement <3 x float> %load0, i32 2
 
   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
 
-  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
-  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+  %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
 
-  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
-  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+  %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 7
+  %v1 = load float, ptr addrspace(3) %vaddr1, align 4
 
   %sum = fadd float %v0, %v1
-  store float %sum, float addrspace(1)* %out, align 4
+  store float %sum, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -41,27 +37,23 @@ define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addr
 
 ; GCN:      ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
-define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
 
-  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
-  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
-  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
-  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
-  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100
+  %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
   %v0 = extractelement <3 x float> %load0, i32 2
 
   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
 
-  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
-  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+  %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
 
-  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
-  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+  %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 7
+  %v1 = load float, ptr addrspace(3) %vaddr1, align 4
 
   %sum = fadd float %v0, %v1
-  store float %sum, float addrspace(1)* %out, align 4
+  store float %sum, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -75,27 +67,23 @@ define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrsp
 ; GCN:      ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
-define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
 
-  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
-  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
-  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
-  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
-  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24
+  %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
   %v0 = extractelement <3 x float> %load0, i32 2
 
   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
 
-  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
-  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+  %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
 
-  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+  %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  %v1 = load float, ptr addrspace(3) %vaddr1, align 4
 
   %sum = fadd float %v0, %v1
-  store float %sum, float addrspace(1)* %out, align 4
+  store float %sum, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -108,26 +96,22 @@ define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrsp
 ; GCN:      ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
-define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
 
-  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
-  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
-  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
-  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
-  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100
+  %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
   %v0 = extractelement <3 x float> %load0, i32 2
 
   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
 
-  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
-  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+  %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
 
-  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
-  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+  %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 26
+  %v1 = load float, ptr addrspace(3) %vaddr1, align 4
 
   %sum = fadd float %v0, %v1
-  store float %sum, float addrspace(1)* %out, align 4
+  store float %sum, ptr addrspace(1) %out, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 19a3823a3edc..54b4e83e75a4 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -5,8 +5,8 @@ target datalayout = "A5"
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
 
-define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) {
   %alloca = alloca i32, i32 %n, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index 81807d9d362f..fd5711428c05 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -11,9 +11,9 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
 ; GCN: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load double, double addrspace(1)* %in
+  %v = load double, ptr addrspace(1) %in
   %cc = fcmp oeq double %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -23,7 +23,7 @@ if:
 
 endif:
   %r = phi double [ %v, %entry ], [ %u, %if ]
-  store double %r, double addrspace(1)* %out
+  store double %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,9 +33,9 @@ endif:
 ; GCN: v_add_f64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %v = load double, double addrspace(4)* %in
+  %v = load double, ptr addrspace(4) %in
   %cc = fcmp oeq double %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -45,7 +45,7 @@ if:
 
 endif:
   %r = phi double [ %v, %entry ], [ %u, %if ]
-  store double %r, double addrspace(1)* %out
+  store double %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -64,9 +64,9 @@ endif:
 ; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dword v
 ; GCNX3: buffer_store_dwordx3
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
 entry:
-  %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %v = load <3 x i32>, ptr addrspace(1) %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -76,7 +76,7 @@ if:
 
 endif:
   %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
-  store <3 x i32> %r, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -95,9 +95,9 @@ endif:
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
 entry:
-  %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %v = load <4 x i32>, ptr addrspace(1) %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -107,6 +107,6 @@ if:
 
 endif:
   %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
-  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %r, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
index 0ea7662bb86c..15987d112e32 100644
--- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -9,9 +9,9 @@
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -21,7 +21,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,9 +31,9 @@ endif:
 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_diamond(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %else
 
@@ -47,7 +47,7 @@ else:
 
 endif:
   %r = phi float [ %u0, %if ], [ %u1, %else ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -57,9 +57,9 @@ endif:
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
 ; GCN: s_mov_b64 vcc, [[CMP]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
 entry:
-  %v = load i32, i32 addrspace(1)* %in
+  %v = load i32, ptr addrspace(1) %in
   %cc = fcmp oeq float %k, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -70,7 +70,7 @@ if:
 
 endif:
   %r = phi i32 [ %v, %entry ], [ %u, %if ]
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -86,9 +86,9 @@ endif:
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN: v_cndmask_b32_e32
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -106,7 +106,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u.8, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -127,9 +127,9 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -148,7 +148,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u.9, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -161,9 +161,9 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -173,7 +173,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -186,9 +186,9 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(4) %in, float %cnd) #0 {
 entry:
-  %v = load i32, i32 addrspace(4)* %in
+  %v = load i32, ptr addrspace(4) %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -198,16 +198,16 @@ if:
 
 endif:
   %r = phi i32 [ %v, %entry ], [ %u, %if ]
-  store i32 %r, i32 addrspace(1)* %out
+  store i32 %r, ptr addrspace(1) %out
   ret void
 
 }
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 ; GCN: v_cndmask_b32
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
 entry:
-  %v = load float, float addrspace(4)* %in
+  %v = load float, ptr addrspace(4) %in
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -217,7 +217,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -226,7 +226,7 @@ endif:
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 ; GCN: v_cndmask_b32
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(ptr addrspace(1) %out, float %v) #0 {
 entry:
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
@@ -237,7 +237,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -247,9 +247,9 @@ endif:
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(ptr addrspace(4) %in, i32 %cond) #0 {
 entry:
-  %v = load i32, i32 addrspace(4)* %in
+  %v = load i32, ptr addrspace(4) %in
   %cc = icmp eq i32 %cond, 1
   br i1 %cc, label %if, label %endif
 
@@ -273,9 +273,9 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %cond) #0 {
 entry:
-  %v = load float, float addrspace(1)* %in
+  %v = load float, ptr addrspace(1) %in
   %cc = icmp eq i32 %cond, 1
   br i1 %cc, label %if, label %endif
 
@@ -285,7 +285,7 @@ if:
 
 endif:
   %r = phi float [ %v, %entry ], [ %u, %if ]
-  store float %r, float addrspace(1)* %out
+  store float %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -294,9 +294,9 @@ endif:
 ; GCN: s_addc_u32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(ptr addrspace(4) %in, i32 %cond) #0 {
 entry:
-  %v = load i64, i64 addrspace(4)* %in
+  %v = load i64, ptr addrspace(4) %in
   %cc = icmp eq i32 %cond, 1
   br i1 %cc, label %if, label %endif
 
@@ -319,9 +319,9 @@ endif:
 ; GCN-NEXT: s_cselect_b32 s
 ; GCN-NEXT: s_cselect_b32 s
 ; GCN-NEXT: s_cselect_b32 s
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(ptr addrspace(4) %in, i32 %cond) #0 {
 entry:
-  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
+  %v = load <3 x i32>, ptr addrspace(4) %in
   %cc = icmp eq i32 %cond, 1
   br i1 %cc, label %if, label %endif
 
@@ -344,9 +344,9 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(ptr addrspace(4) %in, i32 %cond) #0 {
 entry:
-  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
+  %v = load <4 x i32>, ptr addrspace(4) %in
   %cc = icmp eq i32 %cond, 1
   br i1 %cc, label %if, label %endif
 
@@ -363,7 +363,7 @@ endif:
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
-define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, ptr addrspace(1) %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -376,7 +376,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -384,7 +384,7 @@ done:
 ; GCN: {{^}}; %bb.0:
 ; GCN-NEXT: s_load_dwordx2
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
-define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, ptr addrspace(1) %out) {
 entry:
   br i1 undef, label %else, label %if
 
@@ -396,7 +396,7 @@ else:
 
 done:
   %value = phi i32 [0, %if], [1, %else]
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -409,9 +409,9 @@ done:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
 entry:
-  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
+  %v = load <8 x i32>, ptr addrspace(1) %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -421,7 +421,7 @@ if:
 
 endif:
   %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
-  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 
@@ -434,9 +434,9 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(ptr addrspace(1) %out, ptr addrspace(1) %in, float %cnd) #0 {
 entry:
-  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
+  %v = load <16 x i32>, ptr addrspace(1) %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -446,7 +446,7 @@ if:
 
 endif:
   %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
-  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %r, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/early-inline-alias.ll b/llvm/test/CodeGen/AMDGPU/early-inline-alias.ll
index 42dfa4e7ab4f..d95f78417cea 100644
--- a/llvm/test/CodeGen/AMDGPU/early-inline-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-inline-alias.ll
@@ -1,10 +1,10 @@
 ; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 %s | FileCheck %s
 
-; CHECK: @add1alias = alias i32 (i32), i32 (i32)* @add1
-; CHECK: @add1alias2 = alias i32 (i32), i32 (i32)* @add1
+; CHECK: @add1alias = alias i32 (i32), ptr @add1
+; CHECK: @add1alias2 = alias i32 (i32), ptr @add1
 
-@add1alias = alias i32 (i32), i32 (i32)* @add1
-@add1alias2 = alias i32 (i32), i32 (i32)* @add1
+@add1alias = alias i32 (i32), ptr @add1
+@add1alias2 = alias i32 (i32), ptr @add1
 
 define i32 @add1(i32) {
   %2 = add nsw i32 %0, 1

diff --git a/llvm/test/CodeGen/AMDGPU/early-inline.ll b/llvm/test/CodeGen/AMDGPU/early-inline.ll
index 9ad3de117b1f..c1a049cf055c 100644
--- a/llvm/test/CodeGen/AMDGPU/early-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/early-inline.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 -amdgpu-early-inline-all %s | FileCheck %s
 
-@c_alias = dso_local alias i32 (i32), i32 (i32)* @callee
+@c_alias = dso_local alias i32 (i32), ptr @callee
 
 define dso_local i32 @callee(i32 %x) {
 entry:
@@ -19,7 +19,7 @@ entry:
 define amdgpu_kernel void @caller(i32 %x) {
 entry:
   %res = call i32 @callee(i32 %x)
-  store volatile i32 %res, i32 addrspace(1)* undef
+  store volatile i32 %res, ptr addrspace(1) undef
   ret void
 }
 
@@ -28,6 +28,6 @@ entry:
 define amdgpu_kernel void @alias_caller(i32 %x) {
 entry:
   %res = call i32 @c_alias(i32 %x)
-  store volatile i32 %res, i32 addrspace(1)* undef
+  store volatile i32 %res, ptr addrspace(1) undef
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
index 2565395b8a43..9d077e68d42c 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -7,32 +7,32 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; GCN-LABEL: {{^}}fneg_fsub_f32_fmf:
 ; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; GCN-FMF-NOT: xor
-define amdgpu_kernel void @fneg_fsub_f32_fmf(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fneg_fsub_f32_fmf(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %add = add i32 %tid, 1
-  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
-  %a = load float, float addrspace(1)* %gep, align 4
-  %b = load float, float addrspace(1)* %b_ptr, align 4
+  %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 %add
+  %a = load float, ptr addrspace(1) %gep, align 4
+  %b = load float, ptr addrspace(1) %b_ptr, align 4
   %result = fsub fast float %a, %b
   %neg.result = fsub fast float -0.0, %result
-  store float %neg.result, float addrspace(1)* %out, align 4
+  store float %neg.result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}fneg_fsub_f32_safe:
 ; GCN: v_sub_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
-define amdgpu_kernel void @fneg_fsub_f32_safe(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fneg_fsub_f32_safe(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %add = add i32 %tid, 1
-  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
-  %a = load float, float addrspace(1)* %gep, align 4
-  %b = load float, float addrspace(1)* %b_ptr, align 4
+  %gep = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 %add
+  %a = load float, ptr addrspace(1) %gep, align 4
+  %b = load float, ptr addrspace(1) %b_ptr, align 4
   %result = fsub float %a, %b
   %neg.result = fsub float -0.0, %result
-  store float %neg.result, float addrspace(1)* %out, align 4
+  store float %neg.result, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/extra-sroa-after-unroll.ll b/llvm/test/CodeGen/AMDGPU/extra-sroa-after-unroll.ll
index 6f50abc58fa5..efdfd86107d2 100644
--- a/llvm/test/CodeGen/AMDGPU/extra-sroa-after-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/extra-sroa-after-unroll.ll
@@ -9,82 +9,73 @@ target datalayout = "A5"
 ; O3-NOT: alloca
 ; GCN-COUNT-27: = load
 ; GCN-COUNT-26: = add
-define protected amdgpu_kernel void @t0(i32 addrspace(1)* %p.coerce) #0 {
+define protected amdgpu_kernel void @t0(ptr addrspace(1) %p.coerce) #0 {
 entry:
-  %p = alloca i32*, align 8, addrspace(5)
-  %p.ascast = addrspacecast i32* addrspace(5)* %p to i32**
-  %p.addr = alloca i32*, align 8, addrspace(5)
-  %p.addr.ascast = addrspacecast i32* addrspace(5)* %p.addr to i32**
+  %p = alloca ptr, align 8, addrspace(5)
+  %p.ascast = addrspacecast ptr addrspace(5) %p to ptr
+  %p.addr = alloca ptr, align 8, addrspace(5)
+  %p.addr.ascast = addrspacecast ptr addrspace(5) %p.addr to ptr
   %t = alloca [27 x i32], align 16, addrspace(5)
-  %t.ascast = addrspacecast [27 x i32] addrspace(5)* %t to [27 x i32]*
+  %t.ascast = addrspacecast ptr addrspace(5) %t to ptr
   %sum = alloca i32, align 4, addrspace(5)
-  %sum.ascast = addrspacecast i32 addrspace(5)* %sum to i32*
+  %sum.ascast = addrspacecast ptr addrspace(5) %sum to ptr
   %i = alloca i32, align 4, addrspace(5)
-  %i.ascast = addrspacecast i32 addrspace(5)* %i to i32*
+  %i.ascast = addrspacecast ptr addrspace(5) %i to ptr
   %cleanup.dest.slot = alloca i32, align 4, addrspace(5)
-  %0 = addrspacecast i32 addrspace(1)* %p.coerce to i32*
-  store i32* %0, i32** %p.ascast, align 8
-  %p1 = load i32*, i32** %p.ascast, align 8
-  store i32* %p1, i32** %p.addr.ascast, align 8
-  %1 = bitcast [27 x i32] addrspace(5)* %t to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %1)
-  %arraydecay = getelementptr inbounds [27 x i32], [27 x i32]* %t.ascast, i64 0, i64 0
-  %2 = load i32*, i32** %p.addr.ascast, align 8
-  call void @copy(i32* %arraydecay, i32* %2, i32 27)
-  %3 = bitcast i32 addrspace(5)* %sum to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %3)
-  store i32 0, i32* %sum.ascast, align 4
-  %4 = bitcast i32 addrspace(5)* %i to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %4)
-  store i32 0, i32* %i.ascast, align 4
+  %0 = addrspacecast ptr addrspace(1) %p.coerce to ptr
+  store ptr %0, ptr %p.ascast, align 8
+  %p1 = load ptr, ptr %p.ascast, align 8
+  store ptr %p1, ptr %p.addr.ascast, align 8
+  call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %t)
+  %1 = load ptr, ptr %p.addr.ascast, align 8
+  call void @copy(ptr %t.ascast, ptr %1, i32 27)
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %sum)
+  store i32 0, ptr %sum.ascast, align 4
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %i)
+  store i32 0, ptr %i.ascast, align 4
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %5 = load i32, i32* %i.ascast, align 4
-  %cmp = icmp slt i32 %5, 27
+  %2 = load i32, ptr %i.ascast, align 4
+  %cmp = icmp slt i32 %2, 27
   br i1 %cmp, label %for.body, label %for.cond.cleanup
 
 for.cond.cleanup:                                 ; preds = %for.cond
-  %6 = bitcast i32 addrspace(5)* %i to i8 addrspace(5)*
-  call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %6)
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %i)
   br label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %7 = load i32, i32* %i.ascast, align 4
-  %idxprom = sext i32 %7 to i64
-  %arrayidx = getelementptr inbounds [27 x i32], [27 x i32]* %t.ascast, i64 0, i64 %idxprom
-  %8 = load i32, i32* %arrayidx, align 4
-  %9 = load i32, i32* %sum.ascast, align 4
-  %add = add nsw i32 %9, %8
-  store i32 %add, i32* %sum.ascast, align 4
+  %3 = load i32, ptr %i.ascast, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds [27 x i32], ptr %t.ascast, i64 0, i64 %idxprom
+  %4 = load i32, ptr %arrayidx, align 4
+  %5 = load i32, ptr %sum.ascast, align 4
+  %add = add nsw i32 %5, %4
+  store i32 %add, ptr %sum.ascast, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
-  %10 = load i32, i32* %i.ascast, align 4
-  %inc = add nsw i32 %10, 1
-  store i32 %inc, i32* %i.ascast, align 4
+  %6 = load i32, ptr %i.ascast, align 4
+  %inc = add nsw i32 %6, 1
+  store i32 %inc, ptr %i.ascast, align 4
   br label %for.cond
 
 for.end:                                          ; preds = %for.cond.cleanup
-  %11 = load i32, i32* %sum.ascast, align 4
-  %12 = load i32*, i32** %p.addr.ascast, align 8
-  store i32 %11, i32* %12, align 4
-  %13 = bitcast i32 addrspace(5)* %sum to i8 addrspace(5)*
-  call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %13)
-  %14 = bitcast [27 x i32] addrspace(5)* %t to i8 addrspace(5)*
-  call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %14)
+  %7 = load i32, ptr %sum.ascast, align 4
+  %8 = load ptr, ptr %p.addr.ascast, align 8
+  store i32 %7, ptr %8, align 4
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %sum)
+  call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %t)
   ret void
 }
 
-define internal void @copy(i32* %d, i32* %s, i32 %N) {
+define internal void @copy(ptr %d, ptr %s, i32 %N) {
 entry:
-  %d8 = bitcast i32* %d to i8*
-  %s8 = bitcast i32* %s to i8*
   %N8 = mul i32 %N, 4
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d8, i8* %s8, i32 %N8, i1 false)
+  tail call void @llvm.memcpy.p0.p0.i32(ptr %d, ptr %s, i32 %N8, i1 false)
   ret void
 }
 
-declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
-declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1)
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)
+declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1)

diff --git a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
index be85ca933c33..b0ca6811caff 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -13,32 +13,32 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
-                                                    <4 x i32> addrspace(1)* noalias %out1,
-                                                    i32 addrspace(1)* noalias %out2,
-                                                    i32 addrspace(1)* %in) {
-  %elt0 = load volatile i32, i32 addrspace(1)* %in
-  %elt1 = load volatile i32, i32 addrspace(1)* %in
-  %elt2 = load volatile i32, i32 addrspace(1)* %in
-  %elt3 = load volatile i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(ptr addrspace(1) noalias %out0,
+                                                    ptr addrspace(1) noalias %out1,
+                                                    ptr addrspace(1) noalias %out2,
+                                                    ptr addrspace(1) %in) {
+  %elt0 = load volatile i32, ptr addrspace(1) %in
+  %elt1 = load volatile i32, ptr addrspace(1) %in
+  %elt2 = load volatile i32, ptr addrspace(1) %in
+  %elt3 = load volatile i32, ptr addrspace(1) %in
 
   %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
   %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
   %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
   %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
 
-  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
-  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1
+  store <4 x i32> %vec3, ptr addrspace(1) %out0
+  store <4 x i32> %vec3, ptr addrspace(1) %out1
 
   %extract0 = extractelement <4 x i32> %vec3, i32 0
   %extract1 = extractelement <4 x i32> %vec3, i32 1
   %extract2 = extractelement <4 x i32> %vec3, i32 2
   %extract3 = extractelement <4 x i32> %vec3, i32 3
 
-  store volatile i32 %extract0, i32 addrspace(1)* %out2
-  store volatile i32 %extract1, i32 addrspace(1)* %out2
-  store volatile i32 %extract2, i32 addrspace(1)* %out2
-  store volatile i32 %extract3, i32 addrspace(1)* %out2
+  store volatile i32 %extract0, ptr addrspace(1) %out2
+  store volatile i32 %extract1, ptr addrspace(1) %out2
+  store volatile i32 %extract2, ptr addrspace(1) %out2
+  store volatile i32 %extract3, ptr addrspace(1) %out2
 
   ret void
 }
@@ -55,14 +55,14 @@ define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addr
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
-                                                            <4 x i32> addrspace(1)* noalias %out1,
-                                                            i32 addrspace(1)* noalias %out2,
-                                                            i32 addrspace(1)* %in) {
-  %elt0 = load volatile i32, i32 addrspace(1)* %in
-  %elt1 = load volatile i32, i32 addrspace(1)* %in
-  %elt2 = load volatile i32, i32 addrspace(1)* %in
-  %elt3 = load volatile i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(ptr addrspace(1) noalias %out0,
+                                                            ptr addrspace(1) noalias %out1,
+                                                            ptr addrspace(1) noalias %out2,
+                                                            ptr addrspace(1) %in) {
+  %elt0 = load volatile i32, ptr addrspace(1) %in
+  %elt1 = load volatile i32, ptr addrspace(1) %in
+  %elt2 = load volatile i32, ptr addrspace(1) %in
+  %elt3 = load volatile i32, ptr addrspace(1) %in
 
   %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
   %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
@@ -79,12 +79,12 @@ define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i
   %op2 = xor i32 %extract2, 1231412
   %op3 = and i32 %extract3, 258233412312
 
-  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
+  store <4 x i32> %vec3, ptr addrspace(1) %out0
 
-  store volatile i32 %op0, i32 addrspace(1)* %out2
-  store volatile i32 %op1, i32 addrspace(1)* %out2
-  store volatile i32 %op2, i32 addrspace(1)* %out2
-  store volatile i32 %op3, i32 addrspace(1)* %out2
+  store volatile i32 %op0, ptr addrspace(1) %out2
+  store volatile i32 %op1, ptr addrspace(1) %out2
+  store volatile i32 %op2, ptr addrspace(1) %out2
+  store volatile i32 %op3, ptr addrspace(1) %out2
 
   ret void
 }
@@ -99,14 +99,14 @@ define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i
 
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
-                                                                     <4 x i32> addrspace(1)* noalias %out1,
-                                                                     i64 addrspace(1)* noalias %out2,
-                                                                     i32 addrspace(1)* %in) {
-  %elt0 = load volatile i32, i32 addrspace(1)* %in
-  %elt1 = load volatile i32, i32 addrspace(1)* %in
-  %elt2 = load volatile i32, i32 addrspace(1)* %in
-  %elt3 = load volatile i32, i32 addrspace(1)* %in
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(ptr addrspace(1) noalias %out0,
+                                                                     ptr addrspace(1) noalias %out1,
+                                                                     ptr addrspace(1) noalias %out2,
+                                                                     ptr addrspace(1) %in) {
+  %elt0 = load volatile i32, ptr addrspace(1) %in
+  %elt1 = load volatile i32, ptr addrspace(1) %in
+  %elt2 = load volatile i32, ptr addrspace(1) %in
+  %elt3 = load volatile i32, ptr addrspace(1) %in
 
   %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
   %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
@@ -114,13 +114,13 @@ define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i
   %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
 
   %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
-  store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0
+  store <2 x i64> %bc.vec3, ptr addrspace(1) %out0
 
   %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
   %extract1 = extractelement <2 x i64> %bc.vec3, i32 1
 
-  store volatile i64 %extract0, i64 addrspace(1)* %out2
-  store volatile i64 %extract1, i64 addrspace(1)* %out2
+  store volatile i64 %extract0, ptr addrspace(1) %out2
+  store volatile i64 %extract1, ptr addrspace(1) %out2
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fceil.ll b/llvm/test/CodeGen/AMDGPU/fceil.ll
index 0b913fda8580..07c8b0409ed2 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil.ll
@@ -13,9 +13,9 @@ declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
 ; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fceil_f32(ptr addrspace(1) %out, float %x) {
   %y = call float @llvm.ceil.f32(float %x) nounwind readnone
-  store float %y, float addrspace(1)* %out
+  store float %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -25,9 +25,9 @@ define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fceil_v2f32(ptr addrspace(1) %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
-  store <2 x float> %y, <2 x float> addrspace(1)* %out
+  store <2 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -41,9 +41,9 @@ define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x floa
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+define amdgpu_kernel void @fceil_v3f32(ptr addrspace(1) %out, <3 x float> %x) {
   %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
-  store <3 x float> %y, <3 x float> addrspace(1)* %out
+  store <3 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -57,9 +57,9 @@ define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x floa
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fceil_v4f32(ptr addrspace(1) %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
-  store <4 x float> %y, <4 x float> addrspace(1)* %out
+  store <4 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -82,9 +82,9 @@ define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x floa
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @fceil_v8f32(ptr addrspace(1) %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
-  store <8 x float> %y, <8 x float> addrspace(1)* %out
+  store <8 x float> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -125,8 +125,8 @@ define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x floa
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
-define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @fceil_v16f32(ptr addrspace(1) %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
-  store <16 x float> %y, <16 x float> addrspace(1)* %out
+  store <16 x float> %y, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index a5787714fb7b..7889077ccab0 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -29,18 +29,18 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; SI-DAG: s_cselect_b32 s{{[0-9]+}}, 0x3ff00000, 0
 ; SI: v_add_f64
 ; SI: s_endpgm
-define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @fceil_f64(ptr addrspace(1) %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
-  store double %y, double addrspace(1)* %out
+  store double %y, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}fceil_v2f64:
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @fceil_v2f64(ptr addrspace(1) %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
-  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  store <2 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -48,9 +48,9 @@ define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
-; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @fceil_v3f64(ptr addrspace(1) %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
-;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   store <3 x double> %y, ptr addrspace(1) %out
 ;   ret void
 ; }
 
@@ -59,9 +59,9 @@ define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @fceil_v4f64(ptr addrspace(1) %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
-  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  store <4 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -74,9 +74,9 @@ define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @fceil_v8f64(ptr addrspace(1) %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
-  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  store <8 x double> %y, ptr addrspace(1) %out
   ret void
 }
 
@@ -97,8 +97,8 @@ define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @fceil_v16f64(ptr addrspace(1) %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
-  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  store <16 x double> %y, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
index 106c88be38ae..144bbc249ecd 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-barrier.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
 ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
 
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.workgroup.id.x()
 declare void @llvm.amdgcn.s.barrier()
@@ -17,41 +17,39 @@ declare void @llvm.amdgcn.s.barrier()
 ; GCN: s_waitcnt lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
 ; GCN: flat_store_dword
-define amdgpu_kernel void @test_local(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_local(ptr addrspace(1) %arg) {
 bb:
-  %i = alloca i32 addrspace(1)*, align 4, addrspace(5)
-  store i32 addrspace(1)* %arg, i32 addrspace(1)* addrspace(5)* %i, align 4
+  %i = alloca ptr addrspace(1), align 4, addrspace(5)
+  store ptr addrspace(1) %arg, ptr addrspace(5) %i, align 4
   %i1 = call i32 @llvm.amdgcn.workitem.id.x()
   %i2 = zext i32 %i1 to i64
   %i3 = icmp eq i64 %i2, 0
   br i1 %i3, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb
-  store i32 1911, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4
+  store i32 1911, ptr addrspace(3) @test_local.temp, align 4
   br label %bb5
 
 bb5:                                              ; preds = %bb4, %bb
   fence syncscope("workgroup") release
   call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %i6 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4
-  %i7 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %i, align 4
-  %i8 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i6 = load i32, ptr addrspace(3) @test_local.temp, align 4
+  %i7 = load ptr addrspace(1), ptr addrspace(5) %i, align 4
+  %i8 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i9 = call i32 @llvm.amdgcn.workitem.id.x()
   %i10 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i11 = getelementptr inbounds i8, i8 addrspace(4)* %i8, i64 4
-  %i12 = bitcast i8 addrspace(4)* %i11 to i16 addrspace(4)*
-  %i13 = load i16, i16 addrspace(4)* %i12, align 4
+  %i11 = getelementptr inbounds i8, ptr addrspace(4) %i8, i64 4
+  %i13 = load i16, ptr addrspace(4) %i11, align 4
   %i14 = zext i16 %i13 to i32
   %i15 = mul i32 %i10, %i14
   %i16 = add i32 %i15, %i9
-  %i17 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i17 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i18 = zext i32 %i16 to i64
-  %i19 = bitcast i8 addrspace(4)* %i17 to i64 addrspace(4)*
-  %i20 = load i64, i64 addrspace(4)* %i19, align 8
+  %i20 = load i64, ptr addrspace(4) %i17, align 8
   %i21 = add i64 %i20, %i18
-  %i22 = getelementptr inbounds i32, i32 addrspace(1)* %i7, i64 %i21
-  store i32 %i6, i32 addrspace(1)* %i22, align 4
+  %i22 = getelementptr inbounds i32, ptr addrspace(1) %i7, i64 %i21
+  store i32 %i6, ptr addrspace(1) %i22, align 4
   ret void
 }
 
@@ -60,80 +58,74 @@ bb5:                                              ; preds = %bb4, %bb
 ; GCN: flat_store_dword
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
-define amdgpu_kernel void @test_global(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_global(ptr addrspace(1) %arg) {
 bb:
-  %i = alloca i32 addrspace(1)*, align 4, addrspace(5)
+  %i = alloca ptr addrspace(1), align 4, addrspace(5)
   %i1 = alloca i32, align 4, addrspace(5)
-  store i32 addrspace(1)* %arg, i32 addrspace(1)* addrspace(5)* %i, align 4
-  store i32 0, i32 addrspace(5)* %i1, align 4
+  store ptr addrspace(1) %arg, ptr addrspace(5) %i, align 4
+  store i32 0, ptr addrspace(5) %i1, align 4
   br label %bb2
 
 bb2:                                              ; preds = %bb56, %bb
-  %i3 = load i32, i32 addrspace(5)* %i1, align 4
+  %i3 = load i32, ptr addrspace(5) %i1, align 4
   %i4 = sext i32 %i3 to i64
-  %i5 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i5 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i6 = call i32 @llvm.amdgcn.workitem.id.x()
   %i7 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i8 = getelementptr inbounds i8, i8 addrspace(4)* %i5, i64 4
-  %i9 = bitcast i8 addrspace(4)* %i8 to i16 addrspace(4)*
-  %i10 = load i16, i16 addrspace(4)* %i9, align 4
+  %i8 = getelementptr inbounds i8, ptr addrspace(4) %i5, i64 4
+  %i10 = load i16, ptr addrspace(4) %i8, align 4
   %i11 = zext i16 %i10 to i32
   %i12 = mul i32 %i7, %i11
   %i13 = add i32 %i12, %i6
-  %i14 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i14 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i15 = zext i32 %i13 to i64
-  %i16 = bitcast i8 addrspace(4)* %i14 to i64 addrspace(4)*
-  %i17 = load i64, i64 addrspace(4)* %i16, align 8
+  %i17 = load i64, ptr addrspace(4) %i14, align 8
   %i18 = add i64 %i17, %i15
   %i19 = icmp ult i64 %i4, %i18
   br i1 %i19, label %bb20, label %bb59
 
 bb20:                                             ; preds = %bb2
-  %i21 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i21 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i22 = call i32 @llvm.amdgcn.workitem.id.x()
   %i23 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i24 = getelementptr inbounds i8, i8 addrspace(4)* %i21, i64 4
-  %i25 = bitcast i8 addrspace(4)* %i24 to i16 addrspace(4)*
-  %i26 = load i16, i16 addrspace(4)* %i25, align 4
+  %i24 = getelementptr inbounds i8, ptr addrspace(4) %i21, i64 4
+  %i26 = load i16, ptr addrspace(4) %i24, align 4
   %i27 = zext i16 %i26 to i32
   %i28 = mul i32 %i23, %i27
   %i29 = add i32 %i28, %i22
-  %i30 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i30 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i31 = zext i32 %i29 to i64
-  %i32 = bitcast i8 addrspace(4)* %i30 to i64 addrspace(4)*
-  %i33 = load i64, i64 addrspace(4)* %i32, align 8
+  %i33 = load i64, ptr addrspace(4) %i30, align 8
   %i34 = add i64 %i33, %i31
   %i35 = add i64 %i34, 2184
   %i36 = trunc i64 %i35 to i32
-  %i37 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %i, align 4
-  %i38 = load i32, i32 addrspace(5)* %i1, align 4
+  %i37 = load ptr addrspace(1), ptr addrspace(5) %i, align 4
+  %i38 = load i32, ptr addrspace(5) %i1, align 4
   %i39 = sext i32 %i38 to i64
-  %i40 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i40 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i41 = call i32 @llvm.amdgcn.workitem.id.x()
   %i42 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i43 = getelementptr inbounds i8, i8 addrspace(4)* %i40, i64 4
-  %i44 = bitcast i8 addrspace(4)* %i43 to i16 addrspace(4)*
-  %i45 = load i16, i16 addrspace(4)* %i44, align 4
+  %i43 = getelementptr inbounds i8, ptr addrspace(4) %i40, i64 4
+  %i45 = load i16, ptr addrspace(4) %i43, align 4
   %i46 = zext i16 %i45 to i32
   %i47 = mul i32 %i42, %i46
   %i48 = add i32 %i47, %i41
-  %i49 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i49 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i50 = zext i32 %i48 to i64
-  %i51 = bitcast i8 addrspace(4)* %i49 to i64 addrspace(4)*
-  %i52 = load i64, i64 addrspace(4)* %i51, align 8
+  %i52 = load i64, ptr addrspace(4) %i49, align 8
   %i53 = add i64 %i52, %i50
   %i54 = add i64 %i39, %i53
-  %i55 = getelementptr inbounds i32, i32 addrspace(1)* %i37, i64 %i54
-  store i32 %i36, i32 addrspace(1)* %i55, align 4
+  %i55 = getelementptr inbounds i32, ptr addrspace(1) %i37, i64 %i54
+  store i32 %i36, ptr addrspace(1) %i55, align 4
   fence syncscope("workgroup") release
   call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
   br label %bb56
 
 bb56:                                             ; preds = %bb20
-  %i57 = load i32, i32 addrspace(5)* %i1, align 4
+  %i57 = load i32, ptr addrspace(5) %i1, align 4
   %i58 = add nsw i32 %i57, 1
-  store i32 %i58, i32 addrspace(5)* %i1, align 4
+  store i32 %i58, ptr addrspace(5) %i1, align 4
   br label %bb2
 
 bb59:                                             ; preds = %bb2
@@ -146,57 +138,53 @@ bb59:                                             ; preds = %bb2
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
 ; GCN: flat_store_dword
-define amdgpu_kernel void @test_global_local(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_global_local(ptr addrspace(1) %arg) {
 bb:
-  %i = alloca i32 addrspace(1)*, align 4, addrspace(5)
-  store i32 addrspace(1)* %arg, i32 addrspace(1)* addrspace(5)* %i, align 4
-  %i1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %i, align 4
-  %i2 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i = alloca ptr addrspace(1), align 4, addrspace(5)
+  store ptr addrspace(1) %arg, ptr addrspace(5) %i, align 4
+  %i1 = load ptr addrspace(1), ptr addrspace(5) %i, align 4
+  %i2 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i3 = call i32 @llvm.amdgcn.workitem.id.x()
   %i4 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i5 = getelementptr inbounds i8, i8 addrspace(4)* %i2, i64 4
-  %i6 = bitcast i8 addrspace(4)* %i5 to i16 addrspace(4)*
-  %i7 = load i16, i16 addrspace(4)* %i6, align 4
+  %i5 = getelementptr inbounds i8, ptr addrspace(4) %i2, i64 4
+  %i7 = load i16, ptr addrspace(4) %i5, align 4
   %i8 = zext i16 %i7 to i32
   %i9 = mul i32 %i4, %i8
   %i10 = add i32 %i9, %i3
-  %i11 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i11 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i12 = zext i32 %i10 to i64
-  %i13 = bitcast i8 addrspace(4)* %i11 to i64 addrspace(4)*
-  %i14 = load i64, i64 addrspace(4)* %i13, align 8
+  %i14 = load i64, ptr addrspace(4) %i11, align 8
   %i15 = add i64 %i14, %i12
-  %i16 = getelementptr inbounds i32, i32 addrspace(1)* %i1, i64 %i15
-  store i32 1, i32 addrspace(1)* %i16, align 4
+  %i16 = getelementptr inbounds i32, ptr addrspace(1) %i1, i64 %i15
+  store i32 1, ptr addrspace(1) %i16, align 4
   %i17 = call i32 @llvm.amdgcn.workitem.id.x()
   %i18 = zext i32 %i17 to i64
   %i19 = icmp eq i64 %i18, 0
   br i1 %i19, label %bb20, label %bb21
 
 bb20:                                             ; preds = %bb
-  store i32 2457, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_global_local.temp, i64 0, i64 0), align 4
+  store i32 2457, ptr addrspace(3) @test_global_local.temp, align 4
   br label %bb21
 
 bb21:                                             ; preds = %bb20, %bb
   fence syncscope("workgroup") release
   call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %i22 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_global_local.temp, i64 0, i64 0), align 4
-  %i23 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %i, align 4
-  %i24 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %i22 = load i32, ptr addrspace(3) @test_global_local.temp, align 4
+  %i23 = load ptr addrspace(1), ptr addrspace(5) %i, align 4
+  %i24 = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   %i25 = call i32 @llvm.amdgcn.workitem.id.x()
   %i26 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %i27 = getelementptr inbounds i8, i8 addrspace(4)* %i24, i64 4
-  %i28 = bitcast i8 addrspace(4)* %i27 to i16 addrspace(4)*
-  %i29 = load i16, i16 addrspace(4)* %i28, align 4
+  %i27 = getelementptr inbounds i8, ptr addrspace(4) %i24, i64 4
+  %i29 = load i16, ptr addrspace(4) %i27, align 4
   %i30 = zext i16 %i29 to i32
   %i31 = mul i32 %i26, %i30
   %i32 = add i32 %i31, %i25
-  %i33 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+  %i33 = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
   %i34 = zext i32 %i32 to i64
-  %i35 = bitcast i8 addrspace(4)* %i33 to i64 addrspace(4)*
-  %i36 = load i64, i64 addrspace(4)* %i35, align 8
+  %i36 = load i64, ptr addrspace(4) %i33, align 8
   %i37 = add i64 %i36, %i34
-  %i38 = getelementptr inbounds i32, i32 addrspace(1)* %i23, i64 %i37
-  store i32 %i22, i32 addrspace(1)* %i38, align 4
+  %i38 = getelementptr inbounds i32, ptr addrspace(1) %i23, i64 %i37
+  store i32 %i22, ptr addrspace(1) %i38, align 4
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
index a1bd12d5b2ff..68f481316944 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -32,33 +32,33 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
-  %tmp1 = getelementptr inbounds [576 x double], [576 x double] addrspace(3)* @lds, i32 0, i32 %tmp
-  store double 4.000000e+00, double addrspace(3)* %tmp1, align 8
-  %tmp2 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 66
-  store double 4.000000e+00, double addrspace(3)* %tmp2, align 8
-  %tmp3 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 132
-  store double 4.000000e+00, double addrspace(3)* %tmp3, align 8
-  %tmp4 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 198
-  store double 4.000000e+00, double addrspace(3)* %tmp4, align 8
-  %tmp5 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 264
-  store double 4.000000e+00, double addrspace(3)* %tmp5, align 8
-  %tmp6 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 330
-  store double 4.000000e+00, double addrspace(3)* %tmp6, align 8
-  %tmp7 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 396
-  store double 4.000000e+00, double addrspace(3)* %tmp7, align 8
-  %tmp8 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 462
-  store double 4.000000e+00, double addrspace(3)* %tmp8, align 8
+  %tmp1 = getelementptr inbounds [576 x double], ptr addrspace(3) @lds, i32 0, i32 %tmp
+  store double 4.000000e+00, ptr addrspace(3) %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 66
+  store double 4.000000e+00, ptr addrspace(3) %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 132
+  store double 4.000000e+00, ptr addrspace(3) %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 198
+  store double 4.000000e+00, ptr addrspace(3) %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 264
+  store double 4.000000e+00, ptr addrspace(3) %tmp5, align 8
+  %tmp6 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 330
+  store double 4.000000e+00, ptr addrspace(3) %tmp6, align 8
+  %tmp7 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 396
+  store double 4.000000e+00, ptr addrspace(3) %tmp7, align 8
+  %tmp8 = getelementptr inbounds double, ptr addrspace(3) %tmp1, i32 462
+  store double 4.000000e+00, ptr addrspace(3) %tmp8, align 8
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp7, align 8
-  store double 1.000000e+00, double addrspace(3)* %tmp8, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp1, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp2, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp3, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp4, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp5, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp6, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp7, align 8
+  store double 1.000000e+00, ptr addrspace(3) %tmp8, align 8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
index dc400e06cb12..54e3ec0ef990 100644
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
@@ -7,15 +7,15 @@
 ; GCN-NOT: load_lds_simple
 
 define internal i32 @load_lds_simple() {
-  %load = load i32, i32 addrspace(3)* @lds0, align 4
+  %load = load i32, ptr addrspace(3) @lds0, align 4
   ret i32 %load
 }
 
 ; GCN-LABEL: {{^}}kernel:
 ; GCN: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
 ; GCN: ds_read_b32 v{{[0-9]+}}, [[ADDR]]
-define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(ptr addrspace(1) %out) {
   %call = call i32 @load_lds_simple()
-  store i32 %call, i32 addrspace(1)* %out
+  store i32 %call, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
index 99b334691538..a11a5c674cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
@@ -7,59 +7,59 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 
 @lds0 = addrspace(3) global i32 undef, align 4
 @lds1 = addrspace(3) global [512 x i32] undef, align 4
-@nested.lds.address = addrspace(1) global i32 addrspace(3)* @lds0, align 4
+@nested.lds.address = addrspace(1) global ptr addrspace(3) @lds0, align 4
 @gds0 = addrspace(2) global i32 undef, align 4
 
-@alias.lds0 = alias i32, i32 addrspace(3)* @lds0
-@lds.cycle = addrspace(3) global i32 ptrtoint (i32 addrspace(3)* @lds.cycle to i32), align 4
+@alias.lds0 = alias i32, ptr addrspace(3) @lds0
+@lds.cycle = addrspace(3) global i32 ptrtoint (ptr addrspace(3) @lds.cycle to i32), align 4
 
 
 ; ALL-LABEL: define i32 @load_lds_simple() #0 {
 define i32 @load_lds_simple() {
-  %load = load i32, i32 addrspace(3)* @lds0, align 4
+  %load = load i32, ptr addrspace(3) @lds0, align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i32 @load_gds_simple() #0 {
 define i32 @load_gds_simple() {
-  %load = load i32, i32 addrspace(2)* @gds0, align 4
+  %load = load i32, ptr addrspace(2) @gds0, align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i32 @load_lds_const_gep() #0 {
 define i32 @load_lds_const_gep() {
-  %load = load i32, i32 addrspace(3)* getelementptr inbounds ([512 x i32], [512 x i32] addrspace(3)* @lds1, i64 0, i64 4), align 4
+  %load = load i32, ptr addrspace(3) getelementptr inbounds ([512 x i32], ptr addrspace(3) @lds1, i64 0, i64 4), align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i32 @load_lds_var_gep(i32 %idx) #0 {
 define i32 @load_lds_var_gep(i32 %idx) {
-  %gep = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(3)* %gep, align 4
+  %gep = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(3) %gep, align 4
   ret i32 %load
 }
 
-; ALL-LABEL: define i32 addrspace(3)* @load_nested_address(i32 %idx) #0 {
-define i32 addrspace(3)* @load_nested_address(i32 %idx) {
-  %load = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(1)* @nested.lds.address, align 4
-  ret i32 addrspace(3)* %load
+; ALL-LABEL: define ptr addrspace(3) @load_nested_address(i32 %idx) #0 {
+define ptr addrspace(3) @load_nested_address(i32 %idx) {
+  %load = load ptr addrspace(3), ptr addrspace(1) @nested.lds.address, align 4
+  ret ptr addrspace(3) %load
 }
 
 ; ALL-LABEL: define i32 @load_lds_alias() #0 {
 define i32 @load_lds_alias() {
-  %load = load i32, i32 addrspace(3)* @alias.lds0, align 4
+  %load = load i32, ptr addrspace(3) @alias.lds0, align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i32 @load_lds_cycle() #0 {
 define i32 @load_lds_cycle() {
-  %load = load i32, i32 addrspace(3)* @lds.cycle, align 4
+  %load = load i32, ptr addrspace(3) @lds.cycle, align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i1 @icmp_lds_address() #0 {
 define i1 @icmp_lds_address() {
-  ret i1 icmp eq (i32 addrspace(3)* @lds0, i32 addrspace(3)* null)
+  ret i1 icmp eq (ptr addrspace(3) @lds0, ptr addrspace(3) null)
 }
 
 ; ALL-LABEL: define i32 @transitive_call() #0 {
@@ -70,7 +70,7 @@ define i32 @transitive_call() {
 
 ; ALL-LABEL: define i32 @recursive_call_lds(i32 %arg0) #0 {
 define i32 @recursive_call_lds(i32 %arg0) {
-  %load = load i32, i32 addrspace(3)* @lds0, align 4
+  %load = load i32, ptr addrspace(3) @lds0, align 4
   %add = add i32 %arg0, %load
   %call = call i32 @recursive_call_lds(i32 %add)
   ret i32 %call
@@ -81,13 +81,13 @@ define i32 @recursive_call_lds(i32 %arg0) {
 
 ; ALL-LABEL: define i32 @load_lds_simple_noinline() #0 {
 define i32 @load_lds_simple_noinline() noinline {
-  %load = load i32, i32 addrspace(3)* @lds0, align 4
+  %load = load i32, ptr addrspace(3) @lds0, align 4
   ret i32 %load
 }
 
 ; ALL-LABEL: define i32 @recursive_call_lds_noinline(i32 %arg0) #0 {
 define i32 @recursive_call_lds_noinline(i32 %arg0) noinline {
-  %load = load i32, i32 addrspace(3)* @lds0, align 4
+  %load = load i32, ptr addrspace(3) @lds0, align 4
   %add = add i32 %arg0, %load
   %call = call i32 @recursive_call_lds(i32 %add)
   ret i32 %call

diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 5ccbc52a6f22..0872c6160199 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
 
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
-declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
+declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
 
 ; bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
-declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
-declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
-declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3) * %ptr, <2 x half> %data, i32, i32, i1)
-declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3) * %ptr, <2 x i16> %data)
+declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
+declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
+declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
+declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
 
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data) {
+define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -20,11 +20,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) {
+define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -49,11 +49,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) {
 ; GFX940-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_endpgm
-  %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
 ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -78,22 +78,22 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 {
 ; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX940-NEXT:    s_endpgm
-  %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret void
 }
 
-define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) {
+define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data)
+  %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret float %ret
 }
 
-define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) {
+define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -117,11 +117,11 @@ define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) {
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX940-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret float %ret
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) {
+define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -131,22 +131,22 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x h
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret void
 }
 
-define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data) {
+define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data)
+  %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) {
+define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -156,22 +156,22 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
-  %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret void
 }
 
-define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) {
+define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
 
-define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) {
+define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -181,22 +181,22 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[2:3]
 ; GFX940-NEXT:    s_endpgm
-  %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret void
 }
 
-define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) {
+define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
 
-define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
+define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -205,22 +205,22 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
-  %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
+  %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret void
 }
 
-define <2 x half> @local_atomic_fadd_v2f16_rtn(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
+define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    ds_pk_add_rtn_f16 v0, v0, v1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
+  %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret <2 x half> %ret
 }
 
-define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) {
+define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -233,11 +233,11 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
-  %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret void
 }
 
-define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) {
+define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -247,7 +247,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
-  %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
+  %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index f5104eba35ec..078387b008a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -8,14 +8,14 @@ declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32
 declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
 declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
 declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
-declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
-declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
-declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
-declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
-declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1)
+declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
+declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
 
 define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
 ; GFX90A-LABEL: buffer_atomic_add_noret_f64:
@@ -42,11 +42,11 @@ define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -63,7 +63,7 @@ define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %r
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -92,11 +92,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -113,7 +113,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -142,11 +142,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -163,7 +163,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -192,11 +192,11 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -213,7 +213,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -242,11 +242,11 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -263,7 +263,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -292,11 +292,11 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -313,7 +313,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -342,11 +342,11 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
-  store double %ret, double* undef
+  store double %ret, ptr undef
   ret void
 }
 
-define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) {
+define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
@@ -363,11 +363,11 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> i
 ; GFX90A-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
-  store double %ret, double addrspace(1)* %out, align 8
+  store double %ret, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %ptr, double %data) {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -378,11 +378,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %pt
 ; GFX90A-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) {
+define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -393,11 +393,11 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
 ; GFX90A-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) {
+define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -408,11 +408,11 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
 ; GFX90A-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -439,11 +439,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)*
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -456,11 +456,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspa
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -487,11 +487,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrsp
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -504,11 +504,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspa
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) {
+define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -516,11 +516,11 @@ define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %dat
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double %data) #1 {
+define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -547,11 +547,11 @@ define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
   ret double %ret
 }
 
-define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, double %data) #1 {
+define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -563,11 +563,11 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, d
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
   ret double %ret
 }
 
-define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr, double %data) #1 {
+define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -594,11 +594,11 @@ define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr,
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
   ret double %ret
 }
 
-define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) {
+define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -606,11 +606,11 @@ define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %dat
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) {
+define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -618,11 +618,11 @@ define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %dat
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(double addrspace(1)* %ptr) {
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -647,11 +647,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(double ad
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -678,11 +678,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1 {
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -696,11 +696,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #1 {
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -728,11 +728,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
   ret void
 }
 
-define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
+define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -759,11 +759,11 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
   ret double %ret
 }
 
-define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
+define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -775,11 +775,11 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
   ret double %ret
 }
 
-define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 {
+define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -807,11 +807,11 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
   ret double %ret
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) {
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -823,11 +823,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
   ret void
 }
 
-define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) {
+define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -835,11 +835,11 @@ define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(double* %ptr) {
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -864,11 +864,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(double* %pt
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
+  %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) {
+define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -880,11 +880,11 @@ define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
   ret void
 }
 
-define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) {
+define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -892,11 +892,11 @@ define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data) {
+define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -908,11 +908,11 @@ define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
   ret void
 }
 
-define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) {
+define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -920,11 +920,11 @@ define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr, double %data) {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x24
@@ -935,11 +935,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr
 ; GFX90A-NEXT:    ds_add_f64 v2, v[0:1]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
   ret void
 }
 
-define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data) {
+define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -949,11 +949,11 @@ define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
   ret double %ret
 }
 
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(double addrspace(3)* %ptr, double %data) {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x24
@@ -964,11 +964,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(doubl
 ; GFX90A-NEXT:    ds_add_f64 v2, v[0:1]
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
   ret void
 }
 
-define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(double addrspace(3)* %ptr, double %data) {
+define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -978,11 +978,11 @@ define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(double addrspace(3)
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %ptr, double %data)
+  %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
   ret double %ret
 }
 
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) #1 {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -995,11 +995,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)*
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -1012,11 +1012,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspac
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
-define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double addrspace(3)* %ptr) #4 {
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x24
@@ -1040,11 +1040,11 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double add
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm
 main_body:
-  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
 }
 
-define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) #1 {
+define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data) #1 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1055,11 +1055,11 @@ define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret double %ret
 }
 
-define double @local_atomic_fadd_f64_rtn_ieee_unsafe(double addrspace(3)* %ptr, double %data) #2 {
+define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1069,11 +1069,11 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(double addrspace(3)* %ptr,
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
   ret double %ret
 }
 
-define double @local_atomic_fadd_f64_rtn_ieee_safe(double addrspace(3)* %ptr, double %data) #3 {
+define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1083,7 +1083,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(double addrspace(3)* %ptr, do
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 main_body:
-  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
   ret double %ret
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 26daa586d55c..78065579fc79 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -6,7 +6,7 @@
 ; GCN: v_and_b32_e32 v0, 1, v0
 ; GCN: buffer_store_byte v0, off
 define void @void_func_i1(i1 %arg0) #0 {
-  store i1 %arg0, i1 addrspace(1)* undef
+  store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -18,7 +18,7 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
   %ext = zext i1 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -30,7 +30,7 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
   %ext = sext i1 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -43,7 +43,7 @@ bb:
   br i1 %arg, label %bb2, label %bb1
 
 bb1:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %bb2
 
 bb2:
@@ -54,7 +54,7 @@ bb2:
 ; GCN-NOT: v0
 ; GCN: buffer_store_byte v0, off
 define void @void_func_i8(i8 %arg0) #0 {
-  store i8 %arg0, i8 addrspace(1)* undef
+  store i8 %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -64,7 +64,7 @@ define void @void_func_i8(i8 %arg0) #0 {
 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
   %ext = zext i8 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -74,14 +74,14 @@ define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
 define void @void_func_i8_signext(i8 signext %arg0) #0 {
   %ext = sext i8 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_i16:
 ; GCN: buffer_store_short v0, off
 define void @void_func_i16(i16 %arg0) #0 {
-  store i16 %arg0, i16 addrspace(1)* undef
+  store i16 %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -91,7 +91,7 @@ define void @void_func_i16(i16 %arg0) #0 {
 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
   %ext = zext i16 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -101,7 +101,7 @@ define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
 define void @void_func_i16_signext(i16 signext %arg0) #0 {
   %ext = sext i16 %arg0 to i32
   %add = add i32 %ext, 12
-  store i32 %add, i32 addrspace(1)* undef
+  store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -109,7 +109,7 @@ define void @void_func_i16_signext(i16 signext %arg0) #0 {
 ; GCN-NOT: v0
 ; GCN: buffer_store_dword v0, off
 define void @void_func_i32(i32 %arg0) #0 {
-  store i32 %arg0, i32 addrspace(1)* undef
+  store i32 %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -119,7 +119,7 @@ define void @void_func_i32(i32 %arg0) #0 {
 ; GCN-NOT: v1
 ; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_i64(i64 %arg0) #0 {
-  store i64 %arg0, i64 addrspace(1)* undef
+  store i64 %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -128,7 +128,7 @@ define void @void_func_i64(i64 %arg0) #0 {
 ; CI: v_cvt_f16_f32_e32 v0, v0
 ; GCN: buffer_store_short v0, off
 define void @void_func_f16(half %arg0) #0 {
-  store half %arg0, half addrspace(1)* undef
+  store half %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -136,7 +136,7 @@ define void @void_func_f16(half %arg0) #0 {
 ; GCN-NOT: v0
 ; GCN: buffer_store_dword v0, off
 define void @void_func_f32(float %arg0) #0 {
-  store float %arg0, float addrspace(1)* undef
+  store float %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -146,7 +146,7 @@ define void @void_func_f32(float %arg0) #0 {
 ; GCN-NOT: v1
 ; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_f64(double %arg0) #0 {
-  store double %arg0, double addrspace(1)* undef
+  store double %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -156,21 +156,21 @@ define void @void_func_f64(double %arg0) #0 {
 ; GCN-NOT: v1
 ; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v2i32(<2 x i32> %arg0) #0 {
-  store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
+  store <2 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v3i32:
 ; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
-  store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
+  store <3 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v4i32:
 ; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v4i32(<4 x i32> %arg0) #0 {
-  store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
+  store <4 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -178,7 +178,7 @@ define void @void_func_v4i32(<4 x i32> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dword v4, off
 define void @void_func_v5i32(<5 x i32> %arg0) #0 {
-  store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
+  store <5 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -186,7 +186,7 @@ define void @void_func_v5i32(<5 x i32> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v8i32(<8 x i32> %arg0) #0 {
-  store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
+  store <8 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -196,7 +196,7 @@ define void @void_func_v8i32(<8 x i32> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v16i32(<16 x i32> %arg0) #0 {
-  store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
+  store <16 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -210,7 +210,7 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v32i32(<32 x i32> %arg0) #0 {
-  store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
+  store <32 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -228,14 +228,14 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 ; GCN: buffer_store_dword [[STACKLOAD]], off
 define void @void_func_v33i32(<33 x i32> %arg0) #0 {
-  store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
+  store <33 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v2i64:
 ; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v2i64(<2 x i64> %arg0) #0 {
-  store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
+  store <2 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -243,7 +243,7 @@ define void @void_func_v2i64(<2 x i64> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx2 v[4:5], off
 define void @void_func_v3i64(<3 x i64> %arg0) #0 {
-  store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
+  store <3 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -251,7 +251,7 @@ define void @void_func_v3i64(<3 x i64> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v4i64(<4 x i64> %arg0) #0 {
-  store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
+  store <4 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -260,7 +260,7 @@ define void @void_func_v4i64(<4 x i64> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 ; GCN-DAG: buffer_store_dwordx2 v[8:9], off
 define void @void_func_v5i64(<5 x i64> %arg0) #0 {
-  store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
+  store <5 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -270,7 +270,7 @@ define void @void_func_v5i64(<5 x i64> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v8i64(<8 x i64> %arg0) #0 {
-  store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
+  store <8 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -284,7 +284,7 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v16i64(<16 x i64> %arg0) #0 {
-  store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
+  store <16 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -292,7 +292,7 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 ; GFX9-NOT: v0
 ; GFX9: buffer_store_dword v0, off
 define void @void_func_v2i16(<2 x i16> %arg0) #0 {
-  store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -300,7 +300,7 @@ define void @void_func_v2i16(<2 x i16> %arg0) #0 {
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
 define void @void_func_v3i16(<3 x i16> %arg0) #0 {
-  store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
+  store <3 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -309,7 +309,7 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 ; GFX9-NOT: v1
 ; GFX9: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v4i16(<4 x i16> %arg0) #0 {
-  store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
+  store <4 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -326,14 +326,14 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 ; GFX89-DAG: buffer_store_dwordx2 v[0:1], off
 
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
-  store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
+  store <5 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v8i16:
 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v8i16(<8 x i16> %arg0) #0 {
-  store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
+  store <8 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -341,7 +341,7 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 {
 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
 ; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v16i16(<16 x i16> %arg0) #0 {
-  store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
+  store <16 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -351,7 +351,7 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
   %elt0 = extractelement <2 x i24> %arg0, i32 0
   %elt1 = extractelement <2 x i24> %arg0, i32 1
   %add = add i24 %elt0, %elt1
-  store i24 %add, i24 addrspace(1)* undef
+  store i24 %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -361,21 +361,21 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 ; GCN-NOT: v1
 ; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v2f32(<2 x float> %arg0) #0 {
-  store <2 x float> %arg0, <2 x float> addrspace(1)* undef
+  store <2 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v3f32:
 ; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3f32(<3 x float> %arg0) #0 {
-  store <3 x float> %arg0, <3 x float> addrspace(1)* undef
+  store <3 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v4f32:
 ; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v4f32(<4 x float> %arg0) #0 {
-  store <4 x float> %arg0, <4 x float> addrspace(1)* undef
+  store <4 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -383,7 +383,7 @@ define void @void_func_v4f32(<4 x float> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v8f32(<8 x float> %arg0) #0 {
-  store <8 x float> %arg0, <8 x float> addrspace(1)* undef
+  store <8 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -393,14 +393,14 @@ define void @void_func_v8f32(<8 x float> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v16f32(<16 x float> %arg0) #0 {
-  store <16 x float> %arg0, <16 x float> addrspace(1)* undef
+  store <16 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}void_func_v2f64:
 ; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v2f64(<2 x double> %arg0) #0 {
-  store <2 x double> %arg0, <2 x double> addrspace(1)* undef
+  store <2 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -408,7 +408,7 @@ define void @void_func_v2f64(<2 x double> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx2 v[4:5], off
 define void @void_func_v3f64(<3 x double> %arg0) #0 {
-  store <3 x double> %arg0, <3 x double> addrspace(1)* undef
+  store <3 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -416,7 +416,7 @@ define void @void_func_v3f64(<3 x double> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v4f64(<4 x double> %arg0) #0 {
-  store <4 x double> %arg0, <4 x double> addrspace(1)* undef
+  store <4 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -426,7 +426,7 @@ define void @void_func_v4f64(<4 x double> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v8f64(<8 x double> %arg0) #0 {
-  store <8 x double> %arg0, <8 x double> addrspace(1)* undef
+  store <8 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -440,7 +440,7 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v16f64(<16 x double> %arg0) #0 {
-  store <16 x double> %arg0, <16 x double> addrspace(1)* undef
+  store <16 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -448,7 +448,7 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
 ; GFX9-NOT: v0
 ; GFX9: buffer_store_dword v0, off
 define void @void_func_v2f16(<2 x half> %arg0) #0 {
-  store <2 x half> %arg0, <2 x half> addrspace(1)* undef
+  store <2 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -464,7 +464,7 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 {
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_dword
 define void @void_func_v3f16(<3 x half> %arg0) #0 {
-  store <3 x half> %arg0, <3 x half> addrspace(1)* undef
+  store <3 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -474,7 +474,7 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 {
 ; GFX9-NOT: v[0:1]
 ; GFX9: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v4f16(<4 x half> %arg0) #0 {
-  store <4 x half> %arg0, <4 x half> addrspace(1)* undef
+  store <4 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -483,7 +483,7 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 {
 ; GFX9-NOT: v1
 ; GFX9: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v8f16(<8 x half> %arg0) #0 {
-  store <8 x half> %arg0, <8 x half> addrspace(1)* undef
+  store <8 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -493,7 +493,7 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 {
 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
 ; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v16f16(<16 x half> %arg0) #0 {
-  store <16 x half> %arg0, <16 x half> addrspace(1)* undef
+  store <16 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -504,9 +504,9 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
 ; GCN: buffer_store_dwordx2 v[1:2]
 ; GCN: buffer_store_dword v3
 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
-  store volatile i32 %arg0, i32 addrspace(1)* undef
-  store volatile i64 %arg1, i64 addrspace(1)* undef
-  store volatile i32 %arg2, i32 addrspace(1)* undef
+  store volatile i32 %arg0, ptr addrspace(1) undef
+  store volatile i64 %arg1, ptr addrspace(1) undef
+  store volatile i32 %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -514,7 +514,7 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 ; GCN-NOT: v0
 ; GCN: buffer_store_dword v0, off
 define void @void_func_struct_i32({ i32 } %arg0) #0 {
-  store { i32 } %arg0, { i32 } addrspace(1)* undef
+  store { i32 } %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -522,7 +522,7 @@ define void @void_func_struct_i32({ i32 } %arg0) #0 {
 ; GCN-DAG: buffer_store_byte v0, off
 ; GCN-DAG: buffer_store_dword v1, off
 define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
-  store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
+  store { i8, i32 } %arg0, ptr addrspace(1) undef
   ret void
 }
 
@@ -531,9 +531,9 @@ define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
 ; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
 ; GCN-DAG: buffer_store_dword v[[ELT1]]
 ; GCN-DAG: buffer_store_byte v[[ELT0]]
-define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0) #0 {
-  %arg0.load = load { i8, i32 }, { i8, i32 } addrspace(5)* %arg0
-  store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
+define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+  %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
+  store { i8, i32 } %arg0.load, ptr addrspace(1) undef
   ret void
 }
 
@@ -545,12 +545,12 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
 
 ; GCN: ds_write_b32 v0, v0
 ; GCN: s_setpc_b64
-define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, { i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg1, i32 %arg2) #0 {
-  %arg0.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg0
-  %arg1.load = load volatile { i8, i32 }, { i8, i32 } addrspace(5)* %arg1
-  store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
-  store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef
-  store volatile i32 %arg2, i32 addrspace(3)* undef
+define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }) %arg0, ptr addrspace(5) byval({ i8, i32 }) %arg1, i32 %arg2) #0 {
+  %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
+  %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
+  store volatile { i8, i32 } %arg0.load, ptr addrspace(1) undef
+  store volatile { i8, i32 } %arg1.load, ptr addrspace(1) undef
+  store volatile i32 %arg2, ptr addrspace(3) undef
   ret void
 }
 
@@ -560,11 +560,11 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval({
 ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
 ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
 ; GCN-DAG: buffer_store_dwordx2 v[[[ARG1_LOAD0]]:[[ARG1_LOAD1]]], off
-define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i64 addrspace(5)* byval(i64) %arg1) #0 {
-  %arg0.load = load i32, i32 addrspace(5)* %arg0
-  %arg1.load = load i64, i64 addrspace(5)* %arg1
-  store i32 %arg0.load, i32 addrspace(1)* undef
-  store i64 %arg1.load, i64 addrspace(1)* undef
+define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, ptr addrspace(5) byval(i64) %arg1) #0 {
+  %arg0.load = load i32, ptr addrspace(5) %arg0
+  %arg1.load = load i64, ptr addrspace(5) %arg1
+  store i32 %arg0.load, ptr addrspace(1) undef
+  store i64 %arg1.load, ptr addrspace(1) undef
   ret void
 }
 
@@ -585,9 +585,9 @@ define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval(i32) %arg0, i
 ; GCN: buffer_store_dword v[[LOAD_ARG1]]
 ; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off
 define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile i32 %arg1, i32 addrspace(1)* undef
-  store volatile i64 %arg2, i64 addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile i32 %arg1, ptr addrspace(1) undef
+  store volatile i64 %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -612,11 +612,11 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 
 ; CI: buffer_store_short [[CVT_ARG4]], off
 define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile i1 %arg1, i1 addrspace(1)* undef
-  store volatile i8 %arg2, i8 addrspace(1)* undef
-  store volatile i16 %arg3, i16 addrspace(1)* undef
-  store volatile half %arg4, half addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg1, ptr addrspace(1) undef
+  store volatile i8 %arg2, ptr addrspace(1) undef
+  store volatile i16 %arg3, ptr addrspace(1) undef
+  store volatile half %arg4, ptr addrspace(1) undef
   ret void
 }
 
@@ -629,9 +629,9 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GCN: buffer_store_dwordx2 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]], off
 ; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off
 define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
-  store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <2 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <2 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -641,9 +641,9 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; GFX9: buffer_store_dword [[LOAD_ARG1]], off
 ; GFX9: buffer_store_short [[LOAD_ARG2]], off
 define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
-  store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <2 x i16> %arg1, ptr addrspace(1) undef
+  store volatile <2 x half> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -661,9 +661,9 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
-  store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <2 x i64> %arg1, ptr addrspace(1) undef
+  store volatile <2 x double> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -681,9 +681,9 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
-  store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <4 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <4 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -711,9 +711,9 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]], off
 ; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
-  store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <8 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <8 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -752,9 +752,9 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
-  store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <16 x i32> %arg1, ptr addrspace(1) undef
+  store volatile <16 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
@@ -771,10 +771,10 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
   %arg0.0 = extractelement <3 x float> %arg0, i32 0
   %arg0.1 = extractelement <3 x float> %arg0, i32 1
   %arg0.2 = extractelement <3 x float> %arg0, i32 2
-  store volatile float %arg0.0, float addrspace(3)* undef
-  store volatile float %arg0.1, float addrspace(3)* undef
-  store volatile float %arg0.2, float addrspace(3)* undef
-  store volatile i32 %arg1, i32 addrspace(3)* undef
+  store volatile float %arg0.0, ptr addrspace(3) undef
+  store volatile float %arg0.1, ptr addrspace(3) undef
+  store volatile float %arg0.2, ptr addrspace(3) undef
+  store volatile i32 %arg1, ptr addrspace(3) undef
   ret void
 }
 
@@ -790,25 +790,25 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
   %arg0.0 = extractelement <3 x i32> %arg0, i32 0
   %arg0.1 = extractelement <3 x i32> %arg0, i32 1
   %arg0.2 = extractelement <3 x i32> %arg0, i32 2
-  store volatile i32 %arg0.0, i32 addrspace(3)* undef
-  store volatile i32 %arg0.1, i32 addrspace(3)* undef
-  store volatile i32 %arg0.2, i32 addrspace(3)* undef
-  store volatile i32 %arg1, i32 addrspace(3)* undef
+  store volatile i32 %arg0.0, ptr addrspace(3) undef
+  store volatile i32 %arg0.1, ptr addrspace(3) undef
+  store volatile i32 %arg0.2, ptr addrspace(3) undef
+  store volatile i32 %arg1, ptr addrspace(3) undef
   ret void
 }
 
 ; Check there is no crash.
 ; GCN-LABEL: {{^}}void_func_v16i8:
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
-  store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
+  store volatile <16 x i8> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; Check there is no crash.
 ; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
 define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
-  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
-  store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
+  store volatile <32 x i32> %arg0, ptr addrspace(1) undef
+  store volatile <16 x i8> %arg1, ptr addrspace(1) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll b/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll
index f776272fdc7d..47b2e019c6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-call-relocs.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji < %s | FileCheck %s
 
-declare void @func(i32 addrspace(1)* %out)
+declare void @func(ptr addrspace(1) %out)
 
-declare protected void @protected_func(i32 addrspace(1)* %out)
+declare protected void @protected_func(ptr addrspace(1) %out)
 
-declare hidden void @hidden_func(i32 addrspace(1)* %out)
+declare hidden void @hidden_func(ptr addrspace(1) %out)
 
 ; CHECK-LABEL: call_func:
 ; CHECK: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
@@ -12,8 +12,8 @@ declare hidden void @hidden_func(i32 addrspace(1)* %out)
 ; CHECK: s_addc_u32 s[[GOT_ADDR_HI:[0-9]+]], s[[PC_HI]], func@gotpcrel32@hi+12
 ; CHECK: s_load_dwordx2 s[[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]], s[[[GOT_ADDR_LO]]:[[GOT_ADDR_HI]]], 0x0
 ; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]]
-define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) {
-  call void @func(i32 addrspace(1)* %out)
+define amdgpu_kernel void @call_func(ptr addrspace(1) %out) {
+  call void @func(ptr addrspace(1) %out)
   ret void
 }
 
@@ -22,8 +22,8 @@ define amdgpu_kernel void @call_func(i32 addrspace(1)* %out) {
 ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], protected_func@rel32@lo+4
 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], protected_func@rel32@hi+12
 ; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]]
-define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) {
-  call void @protected_func(i32 addrspace(1)* %out)
+define amdgpu_kernel void @call_protected_func(ptr addrspace(1) %out) {
+  call void @protected_func(ptr addrspace(1) %out)
   ret void
 }
 
@@ -32,8 +32,8 @@ define amdgpu_kernel void @call_protected_func(i32 addrspace(1)* %out) {
 ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], hidden_func@rel32@lo+4
 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], hidden_func@rel32@hi+12
 ; CHECK: s_swappc_b64 s[{{[0-9]+:[0-9]+}}], s[[[ADDR_LO]]:[[ADDR_HI]]]
-define amdgpu_kernel void @call_hidden_func(i32 addrspace(1)* %out) {
-  call void @hidden_func(i32 addrspace(1)* %out)
+define amdgpu_kernel void @call_hidden_func(ptr addrspace(1) %out) {
+  call void @hidden_func(ptr addrspace(1) %out)
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 8110e04c133f..5dc2d127379d 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -7,7 +7,7 @@
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define i1 @i1_func_void() #0 {
-  %val = load i1, i1 addrspace(1)* undef
+  %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
@@ -17,7 +17,7 @@ define i1 @i1_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define zeroext i1 @i1_zeroext_func_void() #0 {
-  %val = load i1, i1 addrspace(1)* undef
+  %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
@@ -27,7 +27,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
 ; GCN-NEXT: s_setpc_b64
 define signext i1 @i1_signext_func_void() #0 {
-  %val = load i1, i1 addrspace(1)* undef
+  %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
@@ -36,7 +36,7 @@ define signext i1 @i1_signext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i8 @i8_func_void() #0 {
-  %val = load i8, i8 addrspace(1)* undef
+  %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
@@ -45,7 +45,7 @@ define i8 @i8_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define zeroext i8 @i8_zeroext_func_void() #0 {
-  %val = load i8, i8 addrspace(1)* undef
+  %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
@@ -54,7 +54,7 @@ define zeroext i8 @i8_zeroext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define signext i8 @i8_signext_func_void() #0 {
-  %val = load i8, i8 addrspace(1)* undef
+  %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
@@ -63,7 +63,7 @@ define signext i8 @i8_signext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i16 @i16_func_void() #0 {
-  %val = load i16, i16 addrspace(1)* undef
+  %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
@@ -72,7 +72,7 @@ define i16 @i16_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define zeroext i16 @i16_zeroext_func_void() #0 {
-  %val = load i16, i16 addrspace(1)* undef
+  %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
@@ -81,7 +81,7 @@ define zeroext i16 @i16_zeroext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define signext i16 @i16_signext_func_void() #0 {
-  %val = load i16, i16 addrspace(1)* undef
+  %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
@@ -90,7 +90,7 @@ define signext i16 @i16_signext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i32 @i32_func_void() #0 {
-  %val = load i32, i32 addrspace(1)* undef
+  %val = load i32, ptr addrspace(1) undef
   ret i32 %val
 }
 
@@ -100,7 +100,7 @@ define i32 @i32_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i48 @i48_func_void() #0 {
-  %val = load i48, i48 addrspace(1)* undef, align 8
+  %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
@@ -110,7 +110,7 @@ define i48 @i48_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define zeroext i48 @i48_zeroext_func_void() #0 {
-  %val = load i48, i48 addrspace(1)* undef, align 8
+  %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
@@ -120,7 +120,7 @@ define zeroext i48 @i48_zeroext_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define signext i48 @i48_signext_func_void() #0 {
-  %val = load i48, i48 addrspace(1)* undef, align 8
+  %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
@@ -157,7 +157,7 @@ define signext i63 @i63_signext_func_void(i63 %val) #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i64 @i64_func_void() #0 {
-  %val = load i64, i64 addrspace(1)* undef
+  %val = load i64, ptr addrspace(1) undef
   ret i64 %val
 }
 
@@ -167,7 +167,7 @@ define i64 @i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define i65 @i65_func_void() #0 {
-  %val = load i65, i65 addrspace(1)* undef
+  %val = load i65, ptr addrspace(1) undef
   ret i65 %val
 }
 
@@ -176,7 +176,7 @@ define i65 @i65_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define float @f32_func_void() #0 {
-  %val = load float, float addrspace(1)* undef
+  %val = load float, ptr addrspace(1) undef
   ret float %val
 }
 
@@ -185,7 +185,7 @@ define float @f32_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define double @f64_func_void() #0 {
-  %val = load double, double addrspace(1)* undef
+  %val = load double, ptr addrspace(1) undef
   ret double %val
 }
 
@@ -194,7 +194,7 @@ define double @f64_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <2 x double> @v2f64_func_void() #0 {
-  %val = load <2 x double>, <2 x double> addrspace(1)* undef
+  %val = load <2 x double>, ptr addrspace(1) undef
   ret <2 x double> %val
 }
 
@@ -203,7 +203,7 @@ define <2 x double> @v2f64_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <2 x i32> @v2i32_func_void() #0 {
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
+  %val = load <2 x i32>, ptr addrspace(1) undef
   ret <2 x i32> %val
 }
 
@@ -212,7 +212,7 @@ define <2 x i32> @v2i32_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <3 x i32> @v3i32_func_void() #0 {
-  %val = load <3 x i32>, <3 x i32> addrspace(1)* undef
+  %val = load <3 x i32>, ptr addrspace(1) undef
   ret <3 x i32> %val
 }
 
@@ -221,7 +221,7 @@ define <3 x i32> @v3i32_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <4 x i32> @v4i32_func_void() #0 {
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
+  %val = load <4 x i32>, ptr addrspace(1) undef
   ret <4 x i32> %val
 }
 
@@ -231,7 +231,7 @@ define <4 x i32> @v4i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <5 x i32> @v5i32_func_void() #0 {
-  %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
+  %val = load volatile <5 x i32>, ptr addrspace(1) undef
   ret <5 x i32> %val
 }
 
@@ -241,8 +241,8 @@ define <5 x i32> @v5i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <8 x i32> @v8i32_func_void() #0 {
-  %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <8 x i32>, ptr addrspace(1) %ptr
   ret <8 x i32> %val
 }
 
@@ -254,8 +254,8 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <16 x i32> @v16i32_func_void() #0 {
-  %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i32>, ptr addrspace(1) %ptr
   ret <16 x i32> %val
 }
 
@@ -271,8 +271,8 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <32 x i32> @v32i32_func_void() #0 {
-  %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <32 x i32>, ptr addrspace(1) %ptr
   ret <32 x i32> %val
 }
 
@@ -281,7 +281,7 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <2 x i64> @v2i64_func_void() #0 {
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* undef
+  %val = load <2 x i64>, ptr addrspace(1) undef
   ret <2 x i64> %val
 }
 
@@ -291,8 +291,8 @@ define <2 x i64> @v2i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <3 x i64> @v3i64_func_void() #0 {
-  %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef
-  %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <3 x i64>, ptr addrspace(1) %ptr
   ret <3 x i64> %val
 }
 
@@ -302,8 +302,8 @@ define <3 x i64> @v3i64_func_void() #0 {
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <4 x i64> @v4i64_func_void() #0 {
-  %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef
-  %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <4 x i64>, ptr addrspace(1) %ptr
   ret <4 x i64> %val
 }
 
@@ -314,8 +314,8 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <5 x i64> @v5i64_func_void() #0 {
-  %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef
-  %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <5 x i64>, ptr addrspace(1) %ptr
   ret <5 x i64> %val
 }
 
@@ -327,8 +327,8 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <8 x i64> @v8i64_func_void() #0 {
-  %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef
-  %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <8 x i64>, ptr addrspace(1) %ptr
   ret <8 x i64> %val
 }
 
@@ -344,8 +344,8 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define <16 x i64> @v16i64_func_void() #0 {
-  %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i64>, ptr addrspace(1) %ptr
   ret <16 x i64> %val
 }
 
@@ -354,7 +354,7 @@ define <16 x i64> @v16i64_func_void() #0 {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <2 x i16> @v2i16_func_void() #0 {
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
+  %val = load <2 x i16>, ptr addrspace(1) undef
   ret <2 x i16> %val
 }
 
@@ -363,7 +363,7 @@ define <2 x i16> @v2i16_func_void() #0 {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <3 x i16> @v3i16_func_void() #0 {
-  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
+  %val = load <3 x i16>, ptr addrspace(1) undef
   ret <3 x i16> %val
 }
 
@@ -372,7 +372,7 @@ define <3 x i16> @v3i16_func_void() #0 {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <4 x i16> @v4i16_func_void() #0 {
-  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
+  %val = load <4 x i16>, ptr addrspace(1) undef
   ret <4 x i16> %val
 }
 
@@ -381,7 +381,7 @@ define <4 x i16> @v4i16_func_void() #0 {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <4 x half> @v4f16_func_void() #0 {
-  %val = load <4 x half>, <4 x half> addrspace(1)* undef
+  %val = load <4 x half>, ptr addrspace(1) undef
   ret <4 x half> %val
 }
 
@@ -392,8 +392,8 @@ define <4 x half> @v4f16_func_void() #0 {
 ; GFX9-NEXT: s_waitcnt
 ; GFX9-NEXT: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
-  %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
-  %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <5 x i16>, ptr addrspace(1) %ptr
   ret <5 x i16> %val
 }
 
@@ -402,8 +402,8 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <8 x i16> @v8i16_func_void() #0 {
-  %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef
-  %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <8 x i16>, ptr addrspace(1) %ptr
   ret <8 x i16> %val
 }
 
@@ -413,8 +413,8 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <16 x i16> @v16i16_func_void() #0 {
-  %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i16>, ptr addrspace(1) %ptr
   ret <16 x i16> %val
 }
 
@@ -425,8 +425,8 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; GCN-DAG: v14
 ; GCN-DAG: v15
 define <16 x i8> @v16i8_func_void() #0 {
-  %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i8>, ptr addrspace(1) %ptr
   ret <16 x i8> %val
 }
 
@@ -438,8 +438,8 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
 ; GCN: s_setpc_b64
 define <4  x i8> @v4i8_func_void() #0 {
-  %ptr = load volatile <4  x i8> addrspace(1)*, <4  x i8> addrspace(1)* addrspace(4)* undef
-  %val = load <4  x i8>, <4  x i8> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <4  x i8>, ptr addrspace(1) %ptr
   ret <4  x i8> %val
 }
 
@@ -449,7 +449,7 @@ define <4  x i8> @v4i8_func_void() #0 {
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
 define {i8, i32} @struct_i8_i32_func_void() #0 {
-  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
+  %val = load { i8, i32 }, ptr addrspace(1) undef
   ret { i8, i32 } %val
 }
 
@@ -458,13 +458,13 @@ define {i8, i32} @struct_i8_i32_func_void() #0 {
 ; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
 ; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}}
 ; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}}
-define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }) %arg0) #0 {
-  %val0 = load volatile i8, i8 addrspace(1)* undef
-  %val1 = load volatile i32, i32 addrspace(1)* undef
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1
-  store i8 %val0, i8 addrspace(5)* %gep0
-  store i32 %val1, i32 addrspace(5)* %gep1
+define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %arg0) #0 {
+  %val0 = load volatile i8, ptr addrspace(1) undef
+  %val1 = load volatile i32, ptr addrspace(1) undef
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 1
+  store i8 %val0, ptr addrspace(5) %gep0
+  store i32 %val1, ptr addrspace(5) %gep1
   ret void
 }
 
@@ -509,8 +509,8 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define <33 x i32> @v33i32_func_void() #0 {
-  %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <33 x i32>, ptr addrspace(1) %ptr
   ret <33 x i32> %val
 }
 
@@ -551,8 +551,8 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
-  %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef
-  %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
   ret { <32 x i32>, i32 }%val
 }
 
@@ -593,8 +593,8 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
-  %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef
-  %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
+  %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
   ret { i32, <32 x i32> }%val
 }
 
@@ -605,10 +605,10 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GCN: ds_read_b32 v2,
 ; GCN: ds_read_b32 v3,
 define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
-  %load0 = load volatile i32, i32 addrspace(3)* undef
-  %load1 = load volatile i32, i32 addrspace(3)* undef
-  %load2 = load volatile i32, i32 addrspace(3)* undef
-  %load3 = load volatile i32, i32 addrspace(3)* undef
+  %load0 = load volatile i32, ptr addrspace(3) undef
+  %load1 = load volatile i32, ptr addrspace(3) undef
+  %load2 = load volatile i32, ptr addrspace(3) undef
+  %load3 = load volatile i32, ptr addrspace(3) undef
 
   %insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0
   %insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1
@@ -624,10 +624,10 @@ define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
 ; GCN: ds_read_b32 v2,
 ; GCN: ds_read_b32 v3,
 define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
-  %load0 = load volatile float, float addrspace(3)* undef
-  %load1 = load volatile float, float addrspace(3)* undef
-  %load2 = load volatile float, float addrspace(3)* undef
-  %load3 = load volatile i32, i32 addrspace(3)* undef
+  %load0 = load volatile float, ptr addrspace(3) undef
+  %load1 = load volatile float, ptr addrspace(3) undef
+  %load2 = load volatile float, ptr addrspace(3) undef
+  %load3 = load volatile i32, ptr addrspace(3) undef
 
   %insert.0 = insertelement <3 x float> undef, float %load0, i32 0
   %insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1
@@ -644,16 +644,16 @@ define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
 ; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0
 ; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
 ; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
-define void @void_func_sret_max_known_zero_bits(i8 addrspace(5)* sret(i8) %arg0) #0 {
-  %arg0.int = ptrtoint i8 addrspace(5)* %arg0 to i32
+define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) #0 {
+  %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32
 
   %lshr0 = lshr i32 %arg0.int, 16
   %lshr1 = lshr i32 %arg0.int, 17
   %lshr2 = lshr i32 %arg0.int, 18
 
-  store volatile i32 %lshr0, i32 addrspace(3)* undef
-  store volatile i32 %lshr1, i32 addrspace(3)* undef
-  store volatile i32 %lshr2, i32 addrspace(3)* undef
+  store volatile i32 %lshr0, ptr addrspace(3) undef
+  store volatile i32 %lshr1, ptr addrspace(3) undef
+  store volatile i32 %lshr2, ptr addrspace(3) undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
index b1fd6d478234..cef597162893 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-allocation.ll
@@ -7,7 +7,7 @@
 
 ; These two objects should be allocated at the same constant offsets
 ; from the base.
-define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @alloc_lds_gds(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: alloc_lds_gds:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 5
@@ -21,15 +21,15 @@ define amdgpu_kernel void @alloc_lds_gds(i32 addrspace(1)* %out) #1 {
 ; GCN-NEXT:    ds_add_u32 v1, v0 offset:12
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_endpgm
-  %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
-  %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
-  %gep.lds = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
-  %val1 = atomicrmw add i32 addrspace(3)* %gep.lds, i32 5 acq_rel
+  %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3
+  %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel
+  %gep.lds = getelementptr [4 x i32], ptr addrspace(3) @lds0, i32 0, i32 3
+  %val1 = atomicrmw add ptr addrspace(3) %gep.lds, i32 5 acq_rel
   ret void
 }
 
 ; The LDS alignment shouldn't change offset of GDS.
-define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @alloc_lds_gds_align(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: alloc_lds_gds_align:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 5
@@ -45,21 +45,21 @@ define amdgpu_kernel void @alloc_lds_gds_align(i32 addrspace(1)* %out) #1 {
 ; GCN-NEXT:    ds_add_u32 v1, v0 offset:12
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_endpgm
-  %gep.gds = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds0, i32 0, i32 3
-  %val0 = atomicrmw add i32 addrspace(2)* %gep.gds, i32 5 acq_rel
+  %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3
+  %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel
 
-  %gep.lds0 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds0, i32 0, i32 3
-  %val1 = atomicrmw add i32 addrspace(3)* %gep.lds0, i32 5 acq_rel
+  %gep.lds0 = getelementptr [4 x i32], ptr addrspace(3) @lds0, i32 0, i32 3
+  %val1 = atomicrmw add ptr addrspace(3) %gep.lds0, i32 5 acq_rel
 
-  %gep.lds1 = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds1, i32 0, i32 3
-  %val2 = atomicrmw add i32 addrspace(3)* %gep.lds1, i32 5 acq_rel
+  %gep.lds1 = getelementptr [4 x i32], ptr addrspace(3) @lds1, i32 0, i32 3
+  %val2 = atomicrmw add ptr addrspace(3) %gep.lds1, i32 5 acq_rel
   ret void
 }
 
 @gds_align8 = internal addrspace(2) global [4 x i32] undef, align 8
 @gds_align32 = internal addrspace(2) global [4 x i32] undef, align 32
 
-define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @gds_global_align(ptr addrspace(1) %out) {
 ; GCN-LABEL: gds_global_align:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 5
@@ -74,14 +74,14 @@ define amdgpu_kernel void @gds_global_align(i32 addrspace(1)* %out) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
 ; GCN-NEXT:    s_endpgm
-  %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
-  %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
-  %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
-  %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
+  %gep.gds0 = getelementptr [4 x i32], ptr addrspace(2) @gds_align8, i32 0, i32 3
+  %val0 = atomicrmw add ptr addrspace(2) %gep.gds0, i32 5 acq_rel
+  %gep.gds1 = getelementptr [4 x i32], ptr addrspace(2) @gds_align32, i32 0, i32 3
+  %val1 = atomicrmw add ptr addrspace(2) %gep.gds1, i32 5 acq_rel
   ret void
 }
 
-define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @gds_global_align_plus_attr(ptr addrspace(1) %out) #0 {
 ; GCN-LABEL: gds_global_align_plus_attr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 5
@@ -96,17 +96,17 @@ define amdgpu_kernel void @gds_global_align_plus_attr(i32 addrspace(1)* %out) #0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
 ; GCN-NEXT:    s_endpgm
-  %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align8, i32 0, i32 3
-  %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
-  %gep.gds1 = getelementptr [4 x i32], [4 x i32] addrspace(2)* @gds_align32, i32 0, i32 3
-  %val1 = atomicrmw add i32 addrspace(2)* %gep.gds1, i32 5 acq_rel
+  %gep.gds0 = getelementptr [4 x i32], ptr addrspace(2) @gds_align8, i32 0, i32 3
+  %val0 = atomicrmw add ptr addrspace(2) %gep.gds0, i32 5 acq_rel
+  %gep.gds1 = getelementptr [4 x i32], ptr addrspace(2) @gds_align32, i32 0, i32 3
+  %val1 = atomicrmw add ptr addrspace(2) %gep.gds1, i32 5 acq_rel
   ret void
 }
 
 @small.gds = internal addrspace(2) global i8 undef, align 1
 @gds.external = external unnamed_addr addrspace(3) global [0 x i32], align 4
 
-define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] addrspace(2)* %gds.arg) #0 {
+define amdgpu_kernel void @gds_extern_align(ptr addrspace(1) %out, ptr addrspace(2) %gds.arg) #0 {
 ; GCN-LABEL: gds_extern_align:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[0:1], 0x8
@@ -123,9 +123,9 @@ define amdgpu_kernel void @gds_extern_align(i32 addrspace(1)* %out, [4 x i32] ad
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1
 ; GCN-NEXT:    s_endpgm
-  call void asm sideeffect "; use $0","s"(i8 addrspace(2)* @small.gds)
-  %gep.gds0 = getelementptr [4 x i32], [4 x i32] addrspace(2)* %gds.arg, i32 0, i32 3
-  %val0 = atomicrmw add i32 addrspace(2)* %gep.gds0, i32 5 acq_rel
+  call void asm sideeffect "; use $0","s"(ptr addrspace(2) @small.gds)
+  %gep.gds0 = getelementptr [4 x i32], ptr addrspace(2) %gds.arg, i32 0, i32 3
+  %val0 = atomicrmw add ptr addrspace(2) %gep.gds0, i32 5 acq_rel
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
index abd00d9fbb7f..982d2c1f33dd 100644
--- a/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/gds-atomic.ll
@@ -7,19 +7,19 @@
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_add_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw volatile add i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_add_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw volatile add ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}atomic_add_ret_gds_const_offset:
 ; GCN: s_movk_i32 m0, 0x80
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 gds
-define amdgpu_kernel void @atomic_add_ret_gds_const_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #0 {
-  %gep = getelementptr i32, i32 addrspace(2)* %gds, i32 5
-  %val = atomicrmw volatile add i32 addrspace(2)* %gep, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_add_ret_gds_const_offset(ptr addrspace(1) %out, ptr addrspace(2) %gds) #0 {
+  %gep = getelementptr i32, ptr addrspace(2) %gds, i32 5
+  %val = atomicrmw volatile add ptr addrspace(2) %gep, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -27,9 +27,9 @@ define amdgpu_kernel void @atomic_add_ret_gds_const_offset(i32 addrspace(1)* %ou
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_sub_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw sub i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_sub_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw sub ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -37,9 +37,9 @@ define amdgpu_kernel void @atomic_sub_ret_gds(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_and_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw and i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_and_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw and ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -47,9 +47,9 @@ define amdgpu_kernel void @atomic_and_ret_gds(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_or_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw or i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_or_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw or ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -57,9 +57,9 @@ define amdgpu_kernel void @atomic_or_ret_gds(i32 addrspace(1)* %out, i32 addrspa
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_xor_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw xor i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_xor_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw xor ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,9 +67,9 @@ define amdgpu_kernel void @atomic_xor_ret_gds(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_umin_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw umin i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_umin_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw umin ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -77,9 +77,9 @@ define amdgpu_kernel void @atomic_umin_ret_gds(i32 addrspace(1)* %out, i32 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_umax_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw umax i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_umax_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw umax ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -87,9 +87,9 @@ define amdgpu_kernel void @atomic_umax_ret_gds(i32 addrspace(1)* %out, i32 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_imin_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw min i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_imin_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw min ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -97,9 +97,9 @@ define amdgpu_kernel void @atomic_imin_ret_gds(i32 addrspace(1)* %out, i32 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_imax_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw max i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_imax_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw max ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -107,9 +107,9 @@ define amdgpu_kernel void @atomic_imax_ret_gds(i32 addrspace(1)* %out, i32 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v[[OFF]], v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_xchg_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = atomicrmw xchg i32 addrspace(2)* %gds, i32 5 acq_rel
-  store i32 %val, i32 addrspace(1)* %out
+define amdgpu_kernel void @atomic_xchg_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = atomicrmw xchg ptr addrspace(2) %gds, i32 5 acq_rel
+  store i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -117,10 +117,10 @@ define amdgpu_kernel void @atomic_xchg_ret_gds(i32 addrspace(1)* %out, i32 addrs
 ; GCN-DAG: v_mov_b32_e32 v[[OFF:[0-9]+]], s
 ; GCN-DAG: s_movk_i32 m0, 0x1000
 ; GCN: ds_cmpst_rtn_b32 v{{[0-9]+}}, v[[OFF:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} gds
-define amdgpu_kernel void @atomic_cmpxchg_ret_gds(i32 addrspace(1)* %out, i32 addrspace(2)* %gds) #1 {
-  %val = cmpxchg i32 addrspace(2)* %gds, i32 0, i32 1 acquire acquire
+define amdgpu_kernel void @atomic_cmpxchg_ret_gds(ptr addrspace(1) %out, ptr addrspace(2) %gds) #1 {
+  %val = cmpxchg ptr addrspace(2) %gds, i32 0, i32 1 acquire acquire
   %x = extractvalue { i32, i1 } %val, 0
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %x, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index b2fd9f6acf9d..8c9be5018a49 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
-define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space(ptr addrspace(3) %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space:
 ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
 ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
-  %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16
-  store i32 99, i32 addrspace(3)* %p
+  %p = getelementptr [1024 x i32], ptr addrspace(3) %array, i16 0, i16 16
+  store i32 99, ptr addrspace(3) %p
   ret void
 }
 
@@ -17,9 +17,9 @@ define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %arr
 ; SI: s_bitset1_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
-define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
-  %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
-  store i32 99, i32 addrspace(3)* %p
+define amdgpu_kernel void @use_gep_address_space_large_offset(ptr addrspace(3) %array) nounwind {
+  %p = getelementptr [1024 x i32], ptr addrspace(3) %array, i16 0, i16 16384
+  store i32 99, ptr addrspace(3) %p
   ret void
 }
 
@@ -39,16 +39,16 @@ define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrs
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
-  %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
-  %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
-  %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
-  %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2
-  %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3
-  store i32 99, i32 addrspace(3)* %p0
-  store i32 99, i32 addrspace(3)* %p1
-  store i32 99, i32 addrspace(3)* %p2
-  store i32 99, i32 addrspace(3)* %p3
+define amdgpu_kernel void @gep_as_vector_v4(<4 x ptr addrspace(3)> %array) nounwind {
+  %p = getelementptr [1024 x i32], <4 x ptr addrspace(3)> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  %p0 = extractelement <4 x ptr addrspace(3)> %p, i32 0
+  %p1 = extractelement <4 x ptr addrspace(3)> %p, i32 1
+  %p2 = extractelement <4 x ptr addrspace(3)> %p, i32 2
+  %p3 = extractelement <4 x ptr addrspace(3)> %p, i32 3
+  store i32 99, ptr addrspace(3) %p0
+  store i32 99, ptr addrspace(3) %p1
+  store i32 99, ptr addrspace(3) %p2
+  store i32 99, ptr addrspace(3) %p3
   ret void
 }
 
@@ -60,12 +60,12 @@ define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %ar
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
-  %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
-  %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
-  %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
-  store i32 99, i32 addrspace(3)* %p0
-  store i32 99, i32 addrspace(3)* %p1
+define amdgpu_kernel void @gep_as_vector_v2(<2 x ptr addrspace(3)> %array) nounwind {
+  %p = getelementptr [1024 x i32], <2 x ptr addrspace(3)> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
+  %p0 = extractelement <2 x ptr addrspace(3)> %p, i32 0
+  %p1 = extractelement <2 x ptr addrspace(3)> %p, i32 1
+  store i32 99, ptr addrspace(3) %p0
+  store i32 99, ptr addrspace(3) %p1
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
index acebbc05641e..deea14b9e866 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx90a < %s | FileCheck %s
 
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double) #8
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr nocapture, double) #8
 
-define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, double addrspace(1)* %b, double %c) {
+define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) {
 ; CHECK-LABEL: IllegalGEPConst:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
@@ -21,9 +21,9 @@ define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, double addrspace(1)
 entry:
   %i = add nsw i32 %a, -1
   %i.2 = sext i32 %i to i64
-  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
-  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
-  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.4, double %c) #8
+  %i.3 = getelementptr inbounds double, ptr addrspace(1) %b, i64 %i.2
+  %i.4 = addrspacecast ptr addrspace(1) %i.3 to ptr
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %i.4, double %c) #8
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index cdba7a1b2b8e..56db9083be9f 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -88,8 +88,8 @@ declare hidden amdgpu_gfx i32 @external_i32_func_i32(i32) #0
 
 ; Structs
 declare hidden amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 }) #0
-declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 })) #0
-declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }), { i8, i32 } addrspace(5)* byval({ i8, i32 })) #0
+declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 })) #0
+declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0
 
 declare hidden amdgpu_gfx void @external_void_func_v16i8(<16 x i8>) #0
 
@@ -336,7 +336,7 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i1, i1 addrspace(1)* undef
+  %var = load volatile i1, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i1_signext(i1 signext%var)
   ret void
 }
@@ -464,7 +464,7 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i1, i1 addrspace(1)* undef
+  %var = load volatile i1, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext %var)
   ret void
 }
@@ -701,7 +701,7 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i8, i8 addrspace(1)* undef
+  %var = load volatile i8, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i8_signext(i8 signext %var)
   ret void
 }
@@ -822,7 +822,7 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i8, i8 addrspace(1)* undef
+  %var = load volatile i8, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext %var)
   ret void
 }
@@ -1059,7 +1059,7 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i16, i16 addrspace(1)* undef
+  %var = load volatile i16, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i16_signext(i16 signext %var)
   ret void
 }
@@ -1180,7 +1180,7 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %var = load volatile i16, i16 addrspace(1)* undef
+  %var = load volatile i16, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext %var)
   ret void
 }
@@ -1540,7 +1540,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %val = load <2 x i64>, ptr addrspace(1) null
   call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val)
   ret void
 }
@@ -1797,7 +1797,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
 
   call amdgpu_gfx void @external_void_func_v3i64(<3 x i64> %val)
@@ -1937,7 +1937,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x i64>, <2 x i64> addrspace(1)* null
+  %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val)
   ret void
@@ -3037,7 +3037,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
+  %val = load <2 x i16>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val)
   ret void
 }
@@ -3154,7 +3154,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
+  %val = load <3 x i16>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val)
   ret void
 }
@@ -3271,7 +3271,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <3 x half>, <3 x half> addrspace(1)* undef
+  %val = load <3 x half>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val)
   ret void
 }
@@ -3627,7 +3627,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
+  %val = load <4 x i16>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val)
   ret void
 }
@@ -3864,7 +3864,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x half>, <2 x half> addrspace(1)* undef
+  %val = load <2 x half>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val)
   ret void
 }
@@ -3981,7 +3981,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
+  %val = load <2 x i32>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val)
   ret void
 }
@@ -4466,7 +4466,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
+  %val = load <4 x i32>, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val)
   ret void
 }
@@ -4858,8 +4858,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
+  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <8 x i32>, ptr addrspace(1) %ptr
   call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val)
   ret void
 }
@@ -5143,8 +5143,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
+  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i32>, ptr addrspace(1) %ptr
   call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val)
   ret void
 }
@@ -5305,8 +5305,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
-  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
+  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <32 x i32>, ptr addrspace(1) %ptr
   call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val)
   ret void
 }
@@ -5478,14 +5478,14 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
-  %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
-  %val1 = load i32, i32 addrspace(1)* undef
+  %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
+  %val0 = load <32 x i32>, ptr addrspace(1) %ptr0
+  %val1 = load i32, ptr addrspace(1) undef
   call amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
   ret void
 }
 
-define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
+define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -5632,7 +5632,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
-  store volatile i32 %val, i32 addrspace(1)* %out
+  store volatile i32 %val, ptr addrspace(1) %out
   ret void
 }
 
@@ -5767,8 +5767,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
-  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
+  %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
+  %val = load { i8, i32 }, ptr addrspace(1) %ptr0
   call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val)
   ret void
 }
@@ -5902,11 +5902,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %val = alloca { i8, i32 }, align 4, addrspace(5)
-  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0
-  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
-  store i8 3, i8 addrspace(5)* %gep0
-  store i32 8, i32 addrspace(5)* %gep1
-  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %val)
+  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
+  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1
+  store i8 3, ptr addrspace(5) %gep0
+  store i32 8, ptr addrspace(5) %gep1
+  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %val)
   ret void
 }
 
@@ -6077,18 +6077,18 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
   %in.val = alloca { i8, i32 }, align 4, addrspace(5)
   %out.val = alloca { i8, i32 }, align 4, addrspace(5)
-  %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0
-  %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1
-  store i8 3, i8 addrspace(5)* %in.gep0
-  store i32 8, i32 addrspace(5)* %in.gep1
-  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }) %out.val, { i8, i32 } addrspace(5)* byval({ i8, i32 }) %in.val)
-  %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0
-  %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1
-  %out.val0 = load i8, i8 addrspace(5)* %out.gep0
-  %out.val1 = load i32, i32 addrspace(5)* %out.gep1
+  %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0
+  %in.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 1
+  store i8 3, ptr addrspace(5) %in.gep0
+  store i32 8, ptr addrspace(5) %in.gep1
+  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %out.val, ptr addrspace(5) byval({ i8, i32 }) %in.val)
+  %out.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 0
+  %out.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 1
+  %out.val0 = load i8, ptr addrspace(5) %out.gep0
+  %out.val1 = load i32, ptr addrspace(5) %out.gep1
 
-  store volatile i8 %out.val0, i8 addrspace(1)* undef
-  store volatile i32 %out.val1, i32 addrspace(1)* undef
+  store volatile i8 %out.val0, ptr addrspace(1) undef
+  store volatile i32 %out.val1, ptr addrspace(1) undef
   ret void
 }
 
@@ -6291,8 +6291,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
-  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
+  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
+  %val = load <16 x i8>, ptr addrspace(1) %ptr
   call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val)
   ret void
 }
@@ -6671,7 +6671,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %alloca = alloca double, align 8, addrspace(5)
-  tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval(double) align 16 %alloca)
+  tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
   ret void
 }
 
@@ -7452,7 +7452,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i64>, <2 x i64> addrspace(4)* null
+  %val = load <2 x i64>, ptr addrspace(4) null
   call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg %val)
   ret void
 }
@@ -7789,7 +7789,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x i64>, <2 x i64> addrspace(4)* null
+  %load = load <2 x i64>, ptr addrspace(4) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
 
   call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg %val)
@@ -7992,7 +7992,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %load = load <2 x i64>, <2 x i64> addrspace(4)* null
+  %load = load <2 x i64>, ptr addrspace(4) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg %val)
   ret void
@@ -9302,7 +9302,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i16>, <2 x i16> addrspace(4)* undef
+  %val = load <2 x i16>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg %val)
   ret void
 }
@@ -9435,7 +9435,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <3 x i16>, <3 x i16> addrspace(4)* undef
+  %val = load <3 x i16>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg %val)
   ret void
 }
@@ -9568,7 +9568,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <3 x half>, <3 x half> addrspace(4)* undef
+  %val = load <3 x half>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg %val)
   ret void
 }
@@ -9973,7 +9973,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x i16>, <4 x i16> addrspace(4)* undef
+  %val = load <4 x i16>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg %val)
   ret void
 }
@@ -10234,7 +10234,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x half>, <2 x half> addrspace(4)* undef
+  %val = load <2 x half>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg %val)
   ret void
 }
@@ -10367,7 +10367,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <2 x i32>, <2 x i32> addrspace(4)* undef
+  %val = load <2 x i32>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg %val)
   ret void
 }
@@ -10960,7 +10960,7 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %val = load <4 x i32>, <4 x i32> addrspace(4)* undef
+  %val = load <4 x i32>, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg %val)
   ret void
 }
@@ -11481,8 +11481,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef
-  %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr
+  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
+  %val = load <8 x i32>, ptr addrspace(4) %ptr
   call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg %val)
   ret void
 }
@@ -11943,8 +11943,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef
-  %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr
+  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
+  %val = load <16 x i32>, ptr addrspace(4) %ptr
   call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg %val)
   ret void
 }
@@ -12366,8 +12366,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
-  %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr
+  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
+  %val = load <32 x i32>, ptr addrspace(4) %ptr
   call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg %val)
   ret void
 }
@@ -12807,9 +12807,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
-  %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef
-  %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0
-  %val1 = load i32, i32 addrspace(4)* undef
+  %ptr0 = load ptr addrspace(4), ptr addrspace(4) undef
+  %val0 = load <32 x i32>, ptr addrspace(4) %ptr0
+  %val1 = load i32, ptr addrspace(4) undef
   call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg %val0, i32 inreg %val1)
   ret void
 }
@@ -13776,7 +13776,7 @@ entry:
   ret void
 }
 
-declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval(double) align 16) #0
+declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16) #0
 declare hidden amdgpu_gfx void @stack_passed_f64_arg(<32 x i32>, double) #0
 declare hidden amdgpu_gfx void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
     <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0

diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index e48d4e5a0ee3..ea5add023d15 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -46,7 +46,7 @@
 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0
 define amdgpu_kernel void @minimal_kernel_inputs() {
   %id = call i32 @llvm.amdgcn.workgroup.id.x()
-  store volatile i32 %id, i32 addrspace(1)* undef
+  store volatile i32 %id, ptr addrspace(1) undef
   ret void
 }
 
@@ -75,8 +75,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() {
 define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
   %alloca = alloca i32, addrspace(5)
   %id = call i32 @llvm.amdgcn.workgroup.id.x()
-  store volatile i32 %id, i32 addrspace(1)* undef
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 %id, ptr addrspace(1) undef
+  store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }
 
@@ -105,10 +105,10 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2
 define amdgpu_kernel void @queue_ptr() {
-  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
-  %load = load volatile i8, i8 addrspace(4)* %queue.ptr
+  %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
+  %load = load volatile i8, ptr addrspace(4) %queue.ptr
   %id = call i32 @llvm.amdgcn.workgroup.id.x()
-  store volatile i32 %id, i32 addrspace(1)* undef
+  store volatile i32 %id, ptr addrspace(1) undef
   ret void
 }
 
@@ -152,28 +152,28 @@ define amdgpu_kernel void @queue_ptr() {
 ; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8
 define amdgpu_kernel void @all_inputs() {
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
 
-  %dispatch.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %load.dispatch = load volatile i8, i8 addrspace(4)* %dispatch.ptr
+  %dispatch.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %load.dispatch = load volatile i8, ptr addrspace(4) %dispatch.ptr
 
-  %queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
-  %load.queue = load volatile i8, i8 addrspace(4)* %queue.ptr
+  %queue.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  %load.queue = load volatile i8, ptr addrspace(4) %queue.ptr
 
-  %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
-  %load.implicitarg = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
+  %implicitarg.ptr = call noalias ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %load.implicitarg = load volatile i8, ptr addrspace(4) %implicitarg.ptr
 
   %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
-  store volatile i32 %id.x, i32 addrspace(1)* undef
+  store volatile i32 %id.x, ptr addrspace(1) undef
 
   %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
-  store volatile i32 %id.y, i32 addrspace(1)* undef
+  store volatile i32 %id.y, ptr addrspace(1) undef
 
   %id.z = call i32 @llvm.amdgcn.workgroup.id.z()
-  store volatile i32 %id.z, i32 addrspace(1)* undef
+  store volatile i32 %id.z, ptr addrspace(1) undef
 
   %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
-  store volatile i64 %dispatch.id, i64 addrspace(1)* undef
+  store volatile i64 %dispatch.id, ptr addrspace(1) undef
 
   ret void
 }
@@ -181,10 +181,10 @@ define amdgpu_kernel void @all_inputs() {
 declare i32 @llvm.amdgcn.workgroup.id.x() #0
 declare i32 @llvm.amdgcn.workgroup.id.y() #0
 declare i32 @llvm.amdgcn.workgroup.id.z() #0
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
-declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
+declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
+declare align 4 ptr addrspace(4) @llvm.amdgcn.queue.ptr() #0
+declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }

diff --git a/llvm/test/CodeGen/AMDGPU/gfx902-without-xnack.ll b/llvm/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
index b88fabba5ed5..04909dad5862 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx902-without-xnack.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 --amdhsa-code-object-version=2 -mattr=-xnack < %s | FileCheck %s
 
 ; CHECK: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
-define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
-  store float 0.0, float addrspace(1)* %out0
+define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out0, ptr addrspace(1) %out1) nounwind {
+  store float 0.0, ptr addrspace(1) %out0
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
index 2f5440de30f0..553b9542a200 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -8,11 +8,11 @@
 define amdgpu_kernel void @test(<4 x i32> %x) #0 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %r1 = tail call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %x, i32 %id, i32 0, i32 0, i32 0)
-  store volatile <4 x float> %r1, <4 x float>* undef
+  store volatile <4 x float> %r1, ptr undef
   %r2 = tail call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %x, i32 %id, i32 0, i32 0, i32 0)
-  store volatile <4 x half> %r2, <4 x half>* undef
+  store volatile <4 x half> %r2, ptr undef
   %r3 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %x, i32 0, i32 0, i32 0)
-  store <4 x i32> %r3, <4 x i32>* undef
+  store <4 x i32> %r3, ptr undef
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
index ec7b7d428d6f..79de55eb63bf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
@@ -5,60 +5,56 @@
 ; amdgcn atomic csub
 ; --------------------------------------------------------------------------------
 
-define amdgpu_ps float @global_csub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_csub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_csub_saddr_i32_rtn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_csub_saddr_i32_rtn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
   %cast.rtn = bitcast i32 %rtn to float
   ret float %cast.rtn
 }
 
-define amdgpu_ps void @global_csub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_csub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_csub_saddr_i32_nortn:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] glc
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
   ret void
 }
 
-define amdgpu_ps void @global_csub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+define amdgpu_ps void @global_csub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
 ; GCN-LABEL: global_csub_saddr_i32_nortn_neg128:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
 ; GCN-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
-  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
-  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
-  %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
-  %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data)
+  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
+  %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
   ret void
 }
 
-declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
+declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0
 
 attributes #0 = { argmemonly nounwind willreturn }

diff --git a/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
index 56be32145fda..b38758bae537 100644
--- a/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
@@ -4,13 +4,13 @@
 ; GCN: flat_load_dword
 ; GCN: flat_load_dword
 ; GCN: flat_store_dword
-define void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 {
+define void @unknown_memdep_analysis(ptr addrspace(1) nocapture readonly %arg, float %arg1) #0 {
 bb:
-  %tmp53 = load float, float addrspace(1)* undef, align 4
-  %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31
-  %tmp55 = load float, float addrspace(1)* %tmp54, align 4
+  %tmp53 = load float, ptr addrspace(1) undef, align 4
+  %tmp54 = getelementptr inbounds float, ptr addrspace(1) %arg, i32 31
+  %tmp55 = load float, ptr addrspace(1) %tmp54, align 4
   %tmp56 = tail call float @llvm.fmuladd.f32(float %arg1, float %tmp53, float %tmp55)
-  store float %tmp56, float addrspace(1)* undef, align 4
+  store float %tmp56, ptr addrspace(1) undef, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index 1617c3bb9fe3..26f80234ffff 100644
--- a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -15,11 +15,11 @@
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @float(ptr addrspace(1) %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
-  %1 = load float, float addrspace(4)* %0
-  store float %1, float addrspace(1)* %out
+  %0 = getelementptr inbounds [5 x float], ptr addrspace(4) @float_gv, i32 0, i32 %index
+  %1 = load float, ptr addrspace(4) %0
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,11 +33,11 @@ entry:
 ; EG: @i32_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @i32(ptr addrspace(1) %out, i32 %index) {
 entry:
-  %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(4)* @i32_gv, i32 0, i32 %index
-  %1 = load i32, i32 addrspace(4)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr inbounds [5 x i32], ptr addrspace(4) @i32_gv, i32 0, i32 %index
+  %1 = load i32, ptr addrspace(4) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -53,10 +53,10 @@ entry:
 ; EG: @struct_foo_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(4)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
-  %load = load i32, i32 addrspace(4)* %gep, align 4
-  store i32 %load, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @struct_foo_gv_load(ptr addrspace(1) %out, i32 %index) {
+  %gep = getelementptr inbounds [1 x %struct.foo], ptr addrspace(4) @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
+  %load = load i32, ptr addrspace(4) %gep, align 4
+  store i32 %load, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -72,10 +72,10 @@ define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index
 ; EG: @array_v1_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
-  %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(4)* @array_v1_gv, i32 0, i32 %index
-  %load = load <1 x i32>, <1 x i32> addrspace(4)* %gep, align 4
-  store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
+define amdgpu_kernel void @array_v1_gv_load(ptr addrspace(1) %out, i32 %index) {
+  %gep = getelementptr inbounds [4 x <1 x i32>], ptr addrspace(4) @array_v1_gv, i32 0, i32 %index
+  %load = load <1 x i32>, ptr addrspace(4) %gep, align 4
+  store <1 x i32> %load, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -84,19 +84,19 @@ define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %i
 ; EG: VTX_READ_32
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
-define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+define amdgpu_kernel void @gv_addressing_in_branch(ptr addrspace(1) %out, i32 %index, i32 %a) {
 entry:
   %0 = icmp eq i32 0, %a
   br i1 %0, label %if, label %else
 
 if:
-  %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
-  %2 = load float, float addrspace(4)* %1
-  store float %2, float addrspace(1)* %out
+  %1 = getelementptr inbounds [5 x float], ptr addrspace(4) @float_gv, i32 0, i32 %index
+  %2 = load float, ptr addrspace(4) %1
+  store float %2, ptr addrspace(1) %out
   br label %endif
 
 else:
-  store float 1.0, float addrspace(1)* %out
+  store float 1.0, ptr addrspace(1) %out
   br label %endif
 
 endif:

diff --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
index 15bca082fffc..4cce4bdb446f 100644
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -12,19 +12,19 @@
 
 ; CHECK-LABEL: {{^}}dynamic_shared_array_0:
 ; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
-define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
+define amdgpu_kernel void @dynamic_shared_array_0(ptr addrspace(1) %out) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %tid.x
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}dynamic_shared_array_1:
 ; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
 ; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
-define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @dynamic_shared_array_1(ptr addrspace(1) %out, i32 %cond) {
 entry:
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %idx.0 = add nsw i32 %tid.x, 64
@@ -32,19 +32,19 @@ entry:
   br i1 %tmp, label %if, label %else
 
 if:                                               ; preds = %entry
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   br label %endif
 
 else:                                             ; preds = %entry
-  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [256 x float], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
+  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
   br label %endif
 
 endif:                                            ; preds = %else, %if
   %val = phi float [ %val0, %if ], [ %val1, %else ]
-  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val, float addrspace(3)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val, ptr addrspace(3) %arrayidx, align 4
   ret void
 }
 
@@ -54,10 +54,10 @@ endif:                                            ; preds = %else, %if
 define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
-  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val0, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx0 = getelementptr inbounds [4096 x float], ptr addrspace(3) @lds2, i32 0, i32 %vidx
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val0, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -69,11 +69,11 @@ define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
-  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
-  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
+  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
   %val1 = uitofp i8 %val0 to float
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 
@@ -86,14 +86,14 @@ define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
-  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
-  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
+  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
   %val1 = uitofp i8 %val0 to float
   %val2 = uitofp i8 %val0 to double
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
-  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared1, i32 0, i32 %tid.x
+  store double %val2, ptr addrspace(3) %arrayidx2, align 4
   ret void
 }
 
@@ -105,14 +105,14 @@ define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
-  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
-  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
+  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
   %val1 = uitofp i8 %val0 to float
   %val2 = uitofp i8 %val0 to double
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
-  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared2, i32 0, i32 %tid.x
+  store double %val2, ptr addrspace(3) %arrayidx2, align 4
   ret void
 }
 
@@ -124,24 +124,24 @@ define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
 define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %vidx = add i32 %tid.x, %idx
-  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
-  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [67 x i8], ptr addrspace(3) @lds3, i32 0, i32 %vidx
+  %val0 = load i8, ptr addrspace(3) %arrayidx0, align 4
   %val1 = uitofp i8 %val0 to float
   %val2 = uitofp i8 %val0 to double
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
-  store double %val2, double addrspace(3)* %arrayidx2, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i32 0, i32 %tid.x
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_shared3, i32 0, i32 %tid.x
+  store double %val2, ptr addrspace(3) %arrayidx2, align 4
   ret void
 }
 
 ; CHECK-LABEL: dynamic_shared_array_with_call:
 ; CHECK-NOT: s_swappc_b64
-define amdgpu_kernel void @dynamic_shared_array_with_call(float addrspace(1)* nocapture readnone %out) local_unnamed_addr {
+define amdgpu_kernel void @dynamic_shared_array_with_call(ptr addrspace(1) nocapture readnone %out) local_unnamed_addr {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %1 = sext i32 %tid.x to i64
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i64 0, i64 %1
-  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i64 0, i64 %1
+  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
   tail call void @store_value(float %val0)
   ret void
 }
@@ -151,8 +151,8 @@ define linkonce_odr hidden void @store_value(float %val1) local_unnamed_addr {
 entry:
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %0 = sext i32 %tid.x to i64
-  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i64 0, i64 %0
-  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i64 0, i64 %0
+  store float %val1, ptr addrspace(3) %arrayidx1, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
index cc734e0453ba..b513a5430546 100644
--- a/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/hoist-cond.ll
@@ -11,7 +11,7 @@
 ; CHECK: s_and_saveexec_b64 s[{{[0-9]+:[0-9]+}}], [[COND]]
 ; CHECK: ; %bb.2:
 
-define amdgpu_kernel void @hoist_cond(float addrspace(1)* nocapture %arg, float addrspace(1)* noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
+define amdgpu_kernel void @hoist_cond(ptr addrspace(1) nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %arg3, i32 %arg4) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp5 = icmp ult i32 %tmp, %arg3
@@ -24,8 +24,8 @@ bb1:                                              ; preds = %bb3, %bb
 
 bb2:                                              ; preds = %bb1
   %tmp10 = zext i32 %tmp7 to i64
-  %tmp11 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 %tmp10
-  %tmp12 = load float, float addrspace(1)* %tmp11, align 4
+  %tmp11 = getelementptr inbounds float, ptr addrspace(1) %arg1, i64 %tmp10
+  %tmp12 = load float, ptr addrspace(1) %tmp11, align 4
   br label %bb3
 
 bb3:                                             ; preds = %bb2, %bb1
@@ -36,7 +36,7 @@ bb3:                                             ; preds = %bb2, %bb1
   br i1 %tmp17, label %bb4, label %bb1
 
 bb4:                                             ; preds = %bb3
-  store float %tmp15, float addrspace(1)* %arg, align 4
+  store float %tmp15, ptr addrspace(1) %arg, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
index 5d80e99d1b9d..9f41543a4a45 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -4,8 +4,8 @@
 ; unsupported device.
 
 ; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
-define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
-  store float 0.0, float addrspace(1)* %out0
+define amdgpu_kernel void @test_kernel(ptr addrspace(1) %out0, ptr addrspace(1) %out1) nounwind {
+  store float 0.0, ptr addrspace(1) %out0
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index fa0f5894cd5b..eb4442a2fd0f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -4,9 +4,9 @@
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_default_ci(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -14,9 +14,9 @@ define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double add
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_default_vi(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #1 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -24,9 +24,9 @@ define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double add
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #2 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -34,9 +34,9 @@ define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double
 ; GCN: float_mode = 48
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f32_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #3 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -44,9 +44,9 @@ define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_f32_f64_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #4 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -54,9 +54,9 @@ define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, dou
 ; GCN: float_mode = 0
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_denormals(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #5 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -64,9 +64,9 @@ define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double a
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 0
 ; GCN: enable_ieee_mode = 1
-define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_dx10_clamp_vi(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #6 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -74,9 +74,9 @@ define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, doub
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 0
-define amdgpu_kernel void @test_no_ieee_mode_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_ieee_mode_vi(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #7 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 
@@ -84,9 +84,9 @@ define amdgpu_kernel void @test_no_ieee_mode_vi(float addrspace(1)* %out0, doubl
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 0
 ; GCN: enable_ieee_mode = 0
-define amdgpu_kernel void @test_no_ieee_mode_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #8 {
-  store float 0.0, float addrspace(1)* %out0
-  store double 0.0, double addrspace(1)* %out1
+define amdgpu_kernel void @test_no_ieee_mode_no_dx10_clamp_vi(ptr addrspace(1) %out0, ptr addrspace(1) %out1) #8 {
+  store float 0.0, ptr addrspace(1) %out0
+  store double 0.0, ptr addrspace(1) %out1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
index db0f60e924e3..0ac6ca290407 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-func-align.ll
@@ -10,9 +10,9 @@
 
 ; HSA: .globl simple_align16
 ; HSA: .p2align 5
-define void @simple_align16(i32 addrspace(1)* addrspace(4)* %ptr.out) align 32 {
+define void @simple_align16(ptr addrspace(4) %ptr.out) align 32 {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
-  store i32 0, i32 addrspace(1)* %out
+  %out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-func.ll
index 524668db5051..23389b33c7fc 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-func.ll
@@ -51,19 +51,19 @@
 ; HSA: .size   simple, .Lfunc_end0-simple
 ; HSA: ; Function info:
 ; HSA-NOT: COMPUTE_PGM_RSRC2
-define void @simple(i32 addrspace(1)* addrspace(4)* %ptr.out) {
+define void @simple(ptr addrspace(4) %ptr.out) {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
-  store i32 0, i32 addrspace(1)* %out
+  %out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
 ; Ignore explicit alignment that is too low.
 ; HSA: .globl simple_align2
 ; HSA: .p2align 2
-define void @simple_align2(i32 addrspace(1)* addrspace(4)* %ptr.out) align 2 {
+define void @simple_align2(ptr addrspace(4) %ptr.out) align 2 {
 entry:
-  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
-  store i32 0, i32 addrspace(1)* %out
+  %out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
index 600793810e59..04e580c6e118 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll
@@ -5,8 +5,8 @@
 
 define amdgpu_kernel void @test() {
 entry:
-  store i32 0, i32 addrspace(3)* @internal_group
-  store i32 0, i32 addrspace(3)* @external_group
+  store i32 0, ptr addrspace(3) @internal_group
+  store i32 0, ptr addrspace(3) @external_group
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
index cf874622eca3..5925dd83e905 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-resource-usage-function-ordering.ll
@@ -31,17 +31,17 @@
 ; GFX10:     .sgpr_spill_count: 0
 ; GFX10:     .vgpr_count:     4
 ; GFX10:     .vgpr_spill_count: 0
-define amdgpu_kernel void @test1(float* %x) {
-  %1 = load volatile float, float* %x
+define amdgpu_kernel void @test1(ptr %x) {
+  %1 = load volatile float, ptr %x
   %2 = call float @f(float %1)
-  store volatile float %2, float* %x
+  store volatile float %2, ptr %x
   ret void
 }
 
 define internal float @f(float %arg0) #0 {
   %stack = alloca float, i32 4, align 4, addrspace(5)
-  store volatile float 3.0, float addrspace(5)* %stack
-  %val = load volatile float, float addrspace(5)* %stack
+  store volatile float 3.0, ptr addrspace(5) %stack
+  %val = load volatile float, ptr addrspace(5) %stack
   %add = fadd float %arg0, %val
   ret float %add
 }
@@ -69,10 +69,10 @@ define internal float @f(float %arg0) #0 {
 ; GFX10:     .sgpr_spill_count: 0
 ; GFX10:     .vgpr_count:     4
 ; GFX10:     .vgpr_spill_count: 0
-define amdgpu_kernel void @test2(float* %x) {
-  %1 = load volatile float, float* %x
+define amdgpu_kernel void @test2(ptr %x) {
+  %1 = load volatile float, ptr %x
   %2 = call float @f(float %1)
-  store volatile float %2, float* %x
+  store volatile float %2, ptr %x
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/internalize.ll b/llvm/test/CodeGen/AMDGPU/internalize.ll
index 8bdd3d450ccf..6b2a4d5fc328 100644
--- a/llvm/test/CodeGen/AMDGPU/internalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/internalize.ll
@@ -12,32 +12,32 @@
 
 ; OPT: define internal fastcc void @func_used_noinline(
 ; OPT-NONE: define fastcc void @func_used_noinline(
-define fastcc void @func_used_noinline(i32 addrspace(1)* %out, i32 %tid) #1 {
+define fastcc void @func_used_noinline(ptr addrspace(1) %out, i32 %tid) #1 {
 entry:
-  store volatile i32 %tid, i32 addrspace(1)* %out
+  store volatile i32 %tid, ptr addrspace(1) %out
   ret void
 }
 
 ; OPTNONE: define fastcc void @func_used_alwaysinline(
 ; OPT-NOT: @func_used_alwaysinline
-define fastcc void @func_used_alwaysinline(i32 addrspace(1)* %out, i32 %tid) #2 {
+define fastcc void @func_used_alwaysinline(ptr addrspace(1) %out, i32 %tid) #2 {
 entry:
-  store volatile i32 %tid, i32 addrspace(1)* %out
+  store volatile i32 %tid, ptr addrspace(1) %out
   ret void
 }
 
 ; OPTNONE: define void @func_unused(
 ; OPT-NOT: @func_unused
-define void @func_unused(i32 addrspace(1)* %out, i32 %tid) #1 {
+define void @func_unused(ptr addrspace(1) %out, i32 %tid) #1 {
 entry:
-  store volatile i32 %tid, i32 addrspace(1)* %out
+  store volatile i32 %tid, ptr addrspace(1) %out
   ret void
 }
 
 ; ALL: define amdgpu_kernel void @kernel_unused(
-define amdgpu_kernel void @kernel_unused(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @kernel_unused(ptr addrspace(1) %out) #1 {
 entry:
-  store volatile i32 1, i32 addrspace(1)* %out
+  store volatile i32 1, ptr addrspace(1) %out
   ret void
 }
 
@@ -49,8 +49,8 @@ entry:
 define amdgpu_kernel void @main_kernel() {
 entry:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  tail call fastcc void @func_used_noinline(i32 addrspace(1)* @gvar_used, i32 %tid)
-  tail call fastcc void @func_used_alwaysinline(i32 addrspace(1)* @gvar_used, i32 %tid)
+  tail call fastcc void @func_used_noinline(ptr addrspace(1) @gvar_used, i32 %tid)
+  tail call fastcc void @func_used_alwaysinline(ptr addrspace(1) @gvar_used, i32 %tid)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 572ff60fc330..cdf389deafa4 100644
--- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -10,11 +10,11 @@
 ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], [[PTR]]
-define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
-  %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
-  %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
-  store i16 123, i16 addrspace(1)* %ptr, align 4
-  store i16 456, i16 addrspace(1)* %ptr.1
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(ptr addrspace(1) dereferenceable(4096) nonnull %in) #0 {
+  %ptr = load ptr addrspace(1), ptr addrspace(1) %in, !invariant.load !0
+  %ptr.1 = getelementptr i16, ptr addrspace(1) %ptr, i64 1
+  store i16 123, ptr addrspace(1) %ptr, align 4
+  store i16 456, ptr addrspace(1) %ptr.1
   ret void
 }
 
@@ -22,11 +22,11 @@ define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointe
 ; GCN: s_load_dwordx2 s[[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]]
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], off, s[[[SPTR_LO]]:
-define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(4)* dereferenceable(4096) nonnull %in) #0 {
-  %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(4)* %in, !invariant.load !0
-  %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
-  store i16 123, i16 addrspace(1)* %ptr, align 4
-  store i16 456, i16 addrspace(1)* %ptr.1
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(ptr addrspace(4) dereferenceable(4096) nonnull %in) #0 {
+  %ptr = load ptr addrspace(1), ptr addrspace(4) %in, !invariant.load !0
+  %ptr.1 = getelementptr i16, ptr addrspace(1) %ptr, i64 1
+  store i16 123, ptr addrspace(1) %ptr, align 4
+  store i16 456, ptr addrspace(1) %ptr.1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
index f189a0ffffd4..20e06cad85ff 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll
@@ -37,12 +37,12 @@ define void @vcc() #0 {
   ret void
 }
 
-@llvm.used = appending global [6 x i8*] [i8* bitcast (void ()* @csr to i8*),
-                                         i8* bitcast (void ()* @subregs_for_super to i8*),
-                                         i8* bitcast (void ()* @clobbered_reg_with_sub to i8*),
-                                         i8* bitcast (void ()* @nothing to i8*),
-                                         i8* bitcast (void ()* @special_regs to i8*),
-                                         i8* bitcast (void ()* @vcc to i8*)]
+@llvm.used = appending global [6 x ptr] [ptr @csr,
+                                         ptr @subregs_for_super,
+                                         ptr @clobbered_reg_with_sub,
+                                         ptr @nothing,
+                                         ptr @special_regs,
+                                         ptr @vcc]
 
 attributes #0 = { nounwind }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
index ec9cf022b389..ef3c95b17598 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra-return-address-save-restore.ll
@@ -23,10 +23,10 @@ declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #
 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
 
 ; Function Attrs: argmemonly nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture) #1
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1
 
 ; Function Attrs: norecurse
-define internal fastcc void @svm_node_closure_bsdf(%struct.ShaderData addrspace(1)* %sd, float* %stack, <4 x i32> %node, i32* %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, %struct.ShaderClosure addrspace(1)* %arrayidx.i.i2202, %struct.ShaderClosure addrspace(1)* %retval.0.i.i22089, %struct.ShaderClosure addrspace(1)* %retval.1.i221310, i1 %cmp575, i32 addrspace(1)* %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
+define internal fastcc void @svm_node_closure_bsdf(ptr addrspace(1) %sd, ptr %stack, <4 x i32> %node, ptr %offset, i32 %0, i8 %trunc, float %1, float %2, float %mul80, i1 %cmp412.old, <4 x i32> %3, float %4, i32 %5, i1 %cmp440, i1 %cmp442, i1 %or.cond1306, float %.op, ptr addrspace(1) %arrayidx.i.i2202, ptr addrspace(1) %retval.0.i.i22089, ptr addrspace(1) %retval.1.i221310, i1 %cmp575, ptr addrspace(1) %num_closure_left.i2215, i32 %6, i1 %cmp.i2216, i32 %7, i64 %idx.ext.i2223, i32 %sub5.i2221) #2 {
 ; GCN-LABEL: {{^}}svm_node_closure_bsdf:
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR:v[0-9]+]], s30,
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,
@@ -42,7 +42,7 @@ entry:
   br i1 undef, label %common.ret.critedge, label %cond.true
 
 cond.true:                                        ; preds = %entry
-  %9 = load float, float* null, align 4
+  %9 = load float, ptr null, align 4
   %phi.cmp = fcmp oeq float %9, 0.000000e+00
   br i1 %phi.cmp, label %common.ret, label %cond.true20
 
@@ -63,15 +63,15 @@ LeafBlock:                                        ; preds = %NodeBlock
   br i1 %SwitchLeaf, label %if.end.i.i2285, label %NewDefault
 
 sw.bb:                                            ; preds = %cond.true20
-  %10 = load float, float* null, align 4
-  %11 = load float, float* null, align 4
+  %10 = load float, ptr null, align 4
+  %11 = load float, ptr null, align 4
   %12 = tail call float @llvm.amdgcn.fmed3.f32(float %1, float 0.000000e+00, float 0.000000e+00)
   %mul802 = fmul nsz float %1, 0.000000e+00
   %cmp412.old3 = fcmp nsz ogt float %1, 0.000000e+00
   br i1 %cmp412.old, label %if.then413, label %common.ret
 
 if.then413:                                       ; preds = %sw.bb
-  %13 = load <4 x i32>, <4 x i32> addrspace(1)* null, align 16
+  %13 = load <4 x i32>, ptr addrspace(1) null, align 16
   %14 = extractelement <4 x i32> %node, i64 0
   %cmp4404 = fcmp nsz ole float %1, 0.000000e+00
   %cmp4425 = icmp eq i32 %0, 0
@@ -82,12 +82,12 @@ if.then443:                                       ; preds = %if.then413
   br i1 true, label %if.end511, label %common.ret
 
 common.ret.critedge:                              ; preds = %entry
-  store i32 0, i32* null, align 4
+  store i32 0, ptr null, align 4
   br label %common.ret
 
 NewDefault:                                       ; preds = %LeafBlock1, %LeafBlock
   %phi.store = phi i32 [0, %LeafBlock], [1, %LeafBlock1]
-  store i32 %phi.store, i32* null, align 4
+  store i32 %phi.store, ptr null, align 4
   br label %common.ret
 
 common.ret:                                       ; preds = %if.end.i.i2285, %if.end627.sink.split, %cond.end579, %bsdf_alloc.exit2188, %if.end511, %common.ret.critedge, %if.then443, %sw.bb, %NewDefault, %cond.true
@@ -118,7 +118,7 @@ if.then534:                                       ; preds = %bsdf_alloc.exit2188
   %.op7 = fmul nsz float undef, 0.000000e+00
   %mul558 = select i1 %cmp440, float 0.000000e+00, float %1
   %15 = tail call float @llvm.amdgcn.fmed3.f32(float 0.000000e+00, float 0.000000e+00, float 0.000000e+00)
-  store float %mul558, float addrspace(1)* null, align 4
+  store float %mul558, ptr addrspace(1) null, align 4
   br label %if.end627.sink.split
 
 if.else568:                                       ; preds = %if.then413
@@ -128,57 +128,56 @@ if.then.i2198:                                    ; preds = %if.else568
   br i1 undef, label %closure_alloc.exit.i2210, label %if.end.i.i2207
 
 if.end.i.i2207:                                   ; preds = %if.then.i2198
-  %arrayidx.i.i22028 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 30, i64 undef
+  %arrayidx.i.i22028 = getelementptr inbounds %struct.ShaderData, ptr addrspace(1) %sd, i64 0, i32 30, i64 undef
   br label %closure_alloc.exit.i2210
 
 closure_alloc.exit.i2210:                         ; preds = %if.end.i.i2207, %if.then.i2198
-  %retval.0.i.i220899 = phi %struct.ShaderClosure addrspace(1)* [ %arrayidx.i.i2202, %if.end.i.i2207 ], [ null, %if.then.i2198 ]
+  %retval.0.i.i220899 = phi ptr addrspace(1) [ %arrayidx.i.i2202, %if.end.i.i2207 ], [ null, %if.then.i2198 ]
   br i1 false, label %bsdf_alloc.exit2214, label %if.end.i2212
 
 if.end.i2212:                                     ; preds = %closure_alloc.exit.i2210
   br label %bsdf_alloc.exit2214
 
 bsdf_alloc.exit2214:                              ; preds = %if.end.i2212, %closure_alloc.exit.i2210, %if.else568
-  %retval.1.i22131010 = phi %struct.ShaderClosure addrspace(1)* [ %arrayidx.i.i2202, %if.end.i2212 ], [ null, %closure_alloc.exit.i2210 ], [ null, %if.else568 ]
-  %cmp57511 = icmp ne %struct.ShaderClosure addrspace(1)* %arrayidx.i.i2202, null
+  %retval.1.i22131010 = phi ptr addrspace(1) [ %arrayidx.i.i2202, %if.end.i2212 ], [ null, %closure_alloc.exit.i2210 ], [ null, %if.else568 ]
+  %cmp57511 = icmp ne ptr addrspace(1) %arrayidx.i.i2202, null
   br i1 %cmp442, label %cond.true576, label %cond.end579
 
 cond.true576:                                     ; preds = %bsdf_alloc.exit2214
-  %num_closure_left.i221512 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 25
-  %16 = load i32, i32 addrspace(1)* %num_closure_left.i2215, align 8
+  %num_closure_left.i221512 = getelementptr inbounds %struct.ShaderData, ptr addrspace(1) %sd, i64 0, i32 25
+  %16 = load i32, ptr addrspace(1) %num_closure_left.i2215, align 8
   %cmp.i221613 = icmp slt i32 %0, 0
   br i1 %cmp440, label %cond.end579, label %if.end.i2227
 
 if.end.i2227:                                     ; preds = %cond.true576
   %sub5.i222114 = add nuw nsw i32 %0, 0
-  %17 = load i32, i32 addrspace(1)* null, align 4294967296
+  %17 = load i32, ptr addrspace(1) null, align 4294967296
   %idx.ext.i222315 = sext i32 %0 to i64
-  %add.ptr.i2224 = getelementptr inbounds %struct.ShaderData, %struct.ShaderData addrspace(1)* %sd, i64 0, i32 30, i64 %idx.ext.i2223
+  %add.ptr.i2224 = getelementptr inbounds %struct.ShaderData, ptr addrspace(1) %sd, i64 0, i32 30, i64 %idx.ext.i2223
   %idx.ext8.i22252724 = zext i32 %0 to i64
-  %add.ptr9.i2226 = getelementptr inbounds %struct.ShaderClosure, %struct.ShaderClosure addrspace(1)* %add.ptr.i2224, i64 %idx.ext8.i22252724
-  %phi.cast2731 = bitcast %struct.ShaderClosure addrspace(1)* %add.ptr9.i2226 to %struct.MicrofacetExtra addrspace(1)*
+  %add.ptr9.i2226 = getelementptr inbounds %struct.ShaderClosure, ptr addrspace(1) %add.ptr.i2224, i64 %idx.ext8.i22252724
   br label %cond.end579
 
 cond.end579:                                      ; preds = %if.end.i2227, %cond.true576, %bsdf_alloc.exit2214
-  %cond580 = phi %struct.MicrofacetExtra addrspace(1)* [ null, %bsdf_alloc.exit2214 ], [ %phi.cast2731, %if.end.i2227 ], [ null, %cond.true576 ]
-  %tobool583 = icmp ne %struct.MicrofacetExtra addrspace(1)* %cond580, null
+  %cond580 = phi ptr addrspace(1) [ null, %bsdf_alloc.exit2214 ], [ %add.ptr9.i2226, %if.end.i2227 ], [ null, %cond.true576 ]
+  %tobool583 = icmp ne ptr addrspace(1) %cond580, null
   %or.cond1308 = select i1 %cmp442, i1 %tobool583, i1 false
   br i1 %or.cond1308, label %if.then584, label %common.ret
 
 if.then584:                                       ; preds = %cond.end579
-  store %struct.MicrofacetExtra addrspace(1)* null, %struct.MicrofacetExtra addrspace(1)* addrspace(1)* null, align 4294967296
+  store ptr addrspace(1) null, ptr addrspace(1) null, align 4294967296
   br label %if.end627.sink.split
 
 if.end627.sink.split:                             ; preds = %if.then584, %if.then534
-  store i32 0, i32 addrspace(1)* null, align 4
+  store i32 0, ptr addrspace(1) null, align 4
   br label %common.ret
 
 if.end.i.i2285:                                   ; preds = %cond.true20
-  store i32 0, i32 addrspace(1)* null, align 4294967296
+  store i32 0, ptr addrspace(1) null, align 4294967296
   br label %common.ret
 }
 
-define internal fastcc void @svm_eval_nodes(%struct.ShaderData addrspace(1)* %sd) {
+define internal fastcc void @svm_eval_nodes(ptr addrspace(1) %sd) {
 sw.bb10:
 ; GCN-LABEL: {{^}}svm_eval_nodes:
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR:v[0-9]+]], s30,
@@ -188,7 +187,7 @@ sw.bb10:
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]],
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN: s_setpc_b64 s[30:31]
-  call fastcc void @svm_node_closure_bsdf(%struct.ShaderData addrspace(1)* null, float* null, <4 x i32> zeroinitializer, i32* null, i32 undef, i8 undef, float undef, float undef, float undef, i1 undef, <4 x i32> undef, float undef, i32 undef, i1 undef, i1 undef, i1 undef, float undef, %struct.ShaderClosure addrspace(1)* undef, %struct.ShaderClosure addrspace(1)* undef, %struct.ShaderClosure addrspace(1)* undef, i1 undef, i32 addrspace(1)* undef, i32 undef, i1 undef, i32 undef, i64 undef, i32 undef)
+  call fastcc void @svm_node_closure_bsdf(ptr addrspace(1) null, ptr null, <4 x i32> zeroinitializer, ptr null, i32 undef, i8 undef, float undef, float undef, float undef, i1 undef, <4 x i32> undef, float undef, i32 undef, i1 undef, i1 undef, i1 undef, float undef, ptr addrspace(1) undef, ptr addrspace(1) undef, ptr addrspace(1) undef, i1 undef, ptr addrspace(1) undef, i32 undef, i1 undef, i32 undef, i64 undef, i32 undef)
   ret void
 }
 
@@ -197,7 +196,7 @@ kernel_set_buffer_pointers.exit:
 ; GCN-LABEL: {{^}}kernel_ocl_path_trace_shadow_blocked_dl:
 ; GCN: s_swappc_b64 s[30:31]
 ; GCN: endpgm
-  tail call fastcc void @svm_eval_nodes(%struct.ShaderData addrspace(1)* null)
+  tail call fastcc void @svm_eval_nodes(ptr addrspace(1) null)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 1317a89603c2..92a728ed2672 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -4,9 +4,9 @@
 ; Kernels are not called, so there is no call preserved mask.
 ; GCN-LABEL: {{^}}kernel:
 ; GCN: flat_store_dword
-define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @kernel(ptr addrspace(1) %out) #0 {
 entry:
-  store i32 0, i32 addrspace(1)* %out
+  store i32 0, ptr addrspace(1) %out
   ret void
 }
 
@@ -33,9 +33,9 @@ define hidden void @func() #1 {
 ; GCN: ; NumSgprs: 37
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
-  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %vgpr = load volatile i32, ptr addrspace(1) undef
   tail call void @func()
-  store volatile i32 %vgpr, i32 addrspace(1)* undef
+  store volatile i32 %vgpr, ptr addrspace(1) undef
   ret void
 }
 
@@ -51,9 +51,9 @@ define amdgpu_kernel void @kernel_call() #0 {
 ; GCN: ; NumSgprs: 34
 ; GCN: ; NumVgprs: 10
 define void @func_regular_call() #1 {
-  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %vgpr = load volatile i32, ptr addrspace(1) undef
   tail call void @func()
-  store volatile i32 %vgpr, i32 addrspace(1)* undef
+  store volatile i32 %vgpr, ptr addrspace(1) undef
   ret void
 }
 
@@ -80,9 +80,9 @@ define void @func_tail_call() #1 {
 ; GCN: ; NumSgprs: 34
 ; GCN: ; NumVgprs: 10
 define void @func_call_tail_call() #1 {
-  %vgpr = load volatile i32, i32 addrspace(1)* undef
+  %vgpr = load volatile i32, ptr addrspace(1) undef
   tail call void @func()
-  store volatile i32 %vgpr, i32 addrspace(1)* undef
+  store volatile i32 %vgpr, ptr addrspace(1) undef
   tail call void @func()
   ret void
 }
@@ -106,7 +106,7 @@ define void @test_funcx2() #0 {
 }
 
 ; GCN-LABEL: {{^}}wombat:
-define weak amdgpu_kernel void @wombat(i32* %arg, i32* %arg2) {
+define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
 bb:
   call void @hoge() #0
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/jump-address.ll b/llvm/test/CodeGen/AMDGPU/jump-address.ll
index e134114caea2..6a8f8b9b29cd 100644
--- a/llvm/test/CodeGen/AMDGPU/jump-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/jump-address.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_ps void @main() {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %0 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
   %2 = bitcast float %1 to i32
   %3 = icmp eq i32 %2, 0
@@ -17,7 +17,7 @@ main_body:
   br i1 %7, label %ENDIF, label %ELSE
 
 ELSE:                                             ; preds = %main_body
-  %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %8 = load <4 x float>, ptr addrspace(8) getelementptr ([1024 x <4 x float>], ptr addrspace(8) null, i64 0, i32 1)
   %9 = extractelement <4 x float> %8, i32 0
   %10 = bitcast float %9 to i32
   %11 = icmp eq i32 %10, 1
@@ -40,7 +40,7 @@ ENDIF:                                            ; preds = %IF13, %ELSE, %main_
   ret void
 
 IF13:                                             ; preds = %ELSE
-  %20 = load <4 x float>, <4 x float> addrspace(8)* null
+  %20 = load <4 x float>, ptr addrspace(8) null
   %21 = extractelement <4 x float> %20, i32 0
   %22 = fsub float -0.000000e+00, %21
   %23 = fadd float 0x3FF8000000000000, %22

diff  --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index fd195f9b790e..fbe4f8ffb10b 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -20,10 +20,10 @@ declare void @llvm.trap() #0
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
-define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
-  store volatile i32 1, i32 addrspace(1)* %arg0
+define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index c03cf9edc44b..5c35c107982c 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -5,7 +5,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
 
-define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind {
 ; SI-LABEL: i8_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -73,11 +73,11 @@ define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) noun
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i8 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind {
 ; SI-LABEL: i8_zext_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -148,11 +148,11 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i8 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind {
 ; SI-LABEL: i8_sext_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -223,11 +223,11 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = sext i8 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind {
 ; SI-LABEL: i16_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -295,11 +295,11 @@ define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) no
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i16 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind {
 ; SI-LABEL: i16_zext_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -370,11 +370,11 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i16 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind {
 ; SI-LABEL: i16_sext_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -445,11 +445,11 @@ define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 si
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = sext i16 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind {
 ; SI-LABEL: i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -504,11 +504,11 @@ define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) no
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T1.X, KC0[2].Z,
 entry:
-  store i32 %in, i32 addrspace(1)* %out, align 4
+  store i32 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind {
 ; SI-LABEL: f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -563,11 +563,11 @@ define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ; CM-NEXT:     MOV * T1.X, KC0[2].Z,
 entry:
-  store float %in, float addrspace(1)* %out, align 4
+  store float %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) {
 ; SI-LABEL: v2i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -662,11 +662,11 @@ define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+  store <2 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) {
 ; SI-LABEL: v2i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -741,11 +741,11 @@ define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+  store <2 x i16> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind {
 ; SI-LABEL: v2i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -804,11 +804,11 @@ define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind {
 ; SI-LABEL: v2f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -867,11 +867,11 @@ define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+  store <2 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
 ; SI-LABEL: v3i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -1006,11 +1006,11 @@ define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x
 ; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  store <3 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
 ; SI-LABEL: v3i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1125,11 +1125,11 @@ define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3
 ; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+  store <3 x i16> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
 ; SI-LABEL: v3i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1204,11 +1204,11 @@ define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+  store <3 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
 ; SI-LABEL: v3f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1283,11 +1283,11 @@ define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+  store <3 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) {
 ; SI-LABEL: v4i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -1386,11 +1386,11 @@ define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  store <4 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) {
 ; SI-LABEL: v4i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1537,11 +1537,11 @@ define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in
 ; CM-NEXT:     MOV T2.X, PV.X,
 ; CM-NEXT:     MOV * T5.Y, T3.X,
 entry:
-  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+  store <4 x i16> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind {
 ; SI-LABEL: v4i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1611,11 +1611,11 @@ define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  store <4 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind {
 ; SI-LABEL: v4f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -1685,11 +1685,11 @@ define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+  store <4 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
+define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
 ; SI-LABEL: v5i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1826,11 +1826,11 @@ define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x
 ; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
+  store <5 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
+define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind {
 ; SI-LABEL: v5i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xf
@@ -2057,11 +2057,11 @@ define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5
 ; CM-NEXT:     LSHR * T9.X, T0.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
+  store <5 x i16> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
+define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind {
 ; SI-LABEL: v5i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
@@ -2155,11 +2155,11 @@ define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
+  store <5 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
+define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind {
 ; SI-LABEL: v5f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
@@ -2254,11 +2254,11 @@ define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
+  store <5 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
+define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind {
 ; SI-LABEL: v5i64_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
@@ -2396,11 +2396,11 @@ define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
+  store <5 x i64> %in, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
+define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
 ; SI-LABEL: v5f64_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
@@ -2538,12 +2538,12 @@ define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out,
 ; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
+  store <5 x double> %in, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FIXME: Lots of unpack and re-pack junk on VI
-define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) {
 ; SI-LABEL: v8i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2790,11 +2790,11 @@ define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 ; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+  store <8 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
 ; SI-LABEL: v8i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -3038,11 +3038,11 @@ define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in
 ; CM-NEXT:     MOV * T7.W, T3.X,
 ; CM-NEXT:     MOV * T7.Y, T5.X,
 entry:
-  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+  store <8 x i16> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
 ; SI-LABEL: v8i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
@@ -3149,11 +3149,11 @@ define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+  store <8 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind {
 ; SI-LABEL: v8f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
@@ -3260,12 +3260,12 @@ define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+  store <8 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; FIXME: Pack/repack on VI
-define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
 ; SI-LABEL: v16i8_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
@@ -3711,11 +3711,11 @@ define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in
 ; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+  store <16 x i8> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
 ; SI-LABEL: v16i16_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
@@ -4167,11 +4167,11 @@ define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16>
 ; CM-NEXT:     MOV * T11.W, T7.X, BS:VEC_120/SCL_212
 ; CM-NEXT:     MOV * T11.Y, T9.X,
 entry:
-  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+  store <16 x i16> %in, ptr addrspace(1) %out
   ret void
 }
 
-define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind {
 ; SI-LABEL: v16i32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
@@ -4355,11 +4355,11 @@ define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <
 ; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+  store <16 x i32> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind {
 ; SI-LABEL: v16f32_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
@@ -4543,11 +4543,11 @@ define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out,
 ; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+  store <16 x float> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind {
 ; SI-LABEL: kernel_arg_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4605,11 +4605,11 @@ define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwi
 ; CM-NEXT:     MOV * T0.X, KC0[2].W,
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  store i64 %a, i64 addrspace(1)* %out, align 8
+  store i64 %a, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double  %in) {
 ; SI-LABEL: f64_kernel_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -4668,7 +4668,7 @@ define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store double %in, double addrspace(1)* %out
+  store double %in, ptr addrspace(1) %out
   ret void
 }
 
@@ -4676,12 +4676,12 @@ entry:
 ; XGCN: s_load_dwordx2
 ; XGCN: s_load_dwordx2
 ; XGCN: buffer_store_dwordx2
-; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
-;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind {
+;   store <1 x i64> %a, ptr addrspace(1) %out, align 8
 ;   ret void
 ; }
 
-define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
 ; SI-LABEL: i65_arg:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
@@ -4795,11 +4795,11 @@ define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) no
 ; CM-NEXT:     LSHR * T5.X, T0.W, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 entry:
-  store i65 %in, i65 addrspace(1)* %out, align 4
+  store i65 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -4886,11 +4886,11 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
 ; CM-NEXT:     MOV * T0.Z, 0.0,
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  store i1 %x, i1 addrspace(1)* %out, align 1
+  store i1 %x, ptr addrspace(1) %out, align 1
   ret void
 }
 
-define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_zext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -4958,11 +4958,11 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i1 %x to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_zext_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
@@ -5034,11 +5034,11 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = zext i1 %x to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
+  store i64 %ext, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_sext_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -5108,11 +5108,11 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 ; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = sext i1 %x to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind {
 ; SI-LABEL: i1_arg_sext_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -5187,7 +5187,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 ; CM-NEXT:     MOV * T0.Y, PV.X,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
   %ext = sext i1 %x to i64
-  store i64 %ext, i64 addrspace(1)* %out, align 8
+  store i64 %ext, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -5346,10 +5346,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
   %val1 = extractvalue {i32, i64} %arg0, 1
   %val2 = extractvalue {i32, i64} %arg1, 0
   %val3 = extractvalue {i32, i64} %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
   ret void
 }
 
@@ -5531,10 +5531,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   %val1 = extractvalue <{i32, i64}> %arg0, 1
   %val2 = extractvalue <{i32, i64}> %arg1, 0
   %val3 = extractvalue <{i32, i64}> %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
   ret void
 }
 
@@ -5696,11 +5696,11 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
   %val1 = extractvalue {i32, i64} %arg0, 1
   %val2 = extractvalue {i32, i64} %arg2, 0
   %val3 = extractvalue {i32, i64} %arg2, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
+  store volatile <4 x i32> %arg4, ptr addrspace(1) null
   ret void
 }
 
@@ -5812,8 +5812,8 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 ; CM-NEXT:     MOV * T3.X, KC0[3].X,
 ; CM-NEXT:     MOV * T4.X, literal.x,
 ; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
-  store volatile i16 %arg0, i16 addrspace(1)* undef
-  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  store volatile i16 %arg0, ptr addrspace(1) undef
+  store volatile [3 x i32] %arg1, ptr addrspace(1) undef
   ret void
 }
 
@@ -5983,8 +5983,8 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 ; CM-NEXT:     MOV T2.Y, 0.0,
 ; CM-NEXT:     MOV * T2.Z, 0.0,
 ; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
-  store volatile i8 %arg0, i8 addrspace(1)* undef
-  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  store volatile i8 %arg0, ptr addrspace(1) undef
+  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
   ret void
 }
 
@@ -6041,11 +6041,11 @@ define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
 ; EGCM-NEXT:     MOV * T1.X, literal.x,
 ; EGCM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
   %val = extractvalue [1 x i8] %arg, 0
-  store volatile i8 %val, i8 addrspace(1)* undef
+  store volatile i8 %val, ptr addrspace(1) undef
   ret void
 }
 
-define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
+define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
 ; SI-LABEL: byref_align_constant_i32_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x49
@@ -6123,13 +6123,13 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapt
 ; CM-NEXT:     MOV * T1.X, KC0[18].Z,
 ; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %in = load i32, i32 addrspace(4)* %in.byref
-  store volatile i32 %in, i32 addrspace(1)* %out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+  %in = load i32, ptr addrspace(4) %in.byref
+  store volatile i32 %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
-define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
+define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) %in.byref, i32 %after.offset) {
 ; SI-LABEL: byref_natural_align_constant_v16i32_arg:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
@@ -6347,9 +6347,8 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace
 ; CM-NEXT:     MOV * T1.X, KC0[10].Y,
 ; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
 ; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
-  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
-  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
-  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+  %in = load <16 x i32>, ptr addrspace(4) %in.byref
+  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index 5614a9360c92..9f65d12f00e8 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -10,8 +10,8 @@
 
 ; GCN: s_load_dword s
 ; GCN: s_and_b32
-define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
-  store i1 %x, i1 addrspace(1)* %out, align 1
+define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
+  store i1 %x, ptr addrspace(1) %out, align 1
   ret void
 }
 
@@ -20,9 +20,9 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
 ; HSA-VI: kernarg_segment_alignment = 4
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
 entry:
-  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  store <3 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -30,9 +30,9 @@ entry:
 ; HSA-VI: kernarg_segment_byte_size = 24
 ; HSA-VI: kernarg_segment_alignment = 4
 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
 entry:
-  store i65 %in, i65 addrspace(1)* %out, align 4
+  store i65 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -63,10 +63,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
   %val1 = extractvalue {i32, i64} %arg0, 1
   %val2 = extractvalue {i32, i64} %arg1, 0
   %val3 = extractvalue {i32, i64} %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
   ret void
 }
 
@@ -84,10 +84,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   %val1 = extractvalue <{i32, i64}> %arg0, 1
   %val2 = extractvalue <{i32, i64}> %arg1, 0
   %val3 = extractvalue <{i32, i64}> %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
   ret void
 }
 
@@ -103,27 +103,27 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
   %val1 = extractvalue {i32, i64} %arg0, 1
   %val2 = extractvalue {i32, i64} %arg2, 0
   %val3 = extractvalue {i32, i64} %arg2, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  store volatile i32 %val0, ptr addrspace(1) null
+  store volatile i64 %val1, ptr addrspace(1) null
+  store volatile i32 %val2, ptr addrspace(1) null
+  store volatile i64 %val3, ptr addrspace(1) null
+  store volatile <4 x i32> %arg4, ptr addrspace(1) null
   ret void
 }
 
 ; GCN-LABEL: {{^}}array_3xi32:
 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
-  store volatile i16 %arg0, i16 addrspace(1)* undef
-  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  store volatile i16 %arg0, ptr addrspace(1) undef
+  store volatile [3 x i32] %arg1, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}array_3xi16:
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
-  store volatile i8 %arg0, i8 addrspace(1)* undef
-  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  store volatile i8 %arg0, ptr addrspace(1) undef
+  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
   ret void
 }
 
@@ -131,9 +131,9 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
 ; GCN: s_load_dword [[DWORD:s[0-9]+]]
 ; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}}
 ; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}}
-define amdgpu_kernel void @v2i15_arg(<2 x i15> addrspace(1)* nocapture %out, <2 x i15> %in) {
+define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) {
 entry:
-  store <2 x i15> %in, <2 x i15> addrspace(1)* %out, align 4
+  store <2 x i15> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -143,9 +143,9 @@ entry:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: s_or_b32
-define amdgpu_kernel void @v3i15_arg(<3 x i15> addrspace(1)* nocapture %out, <3 x i15> %in) {
+define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) {
 entry:
-  store <3 x i15> %in, <3 x i15> addrspace(1)* %out, align 4
+  store <3 x i15> %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -154,10 +154,10 @@ entry:
 ; GCN: kernarg_segment_byte_size = 12
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
-define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
-  %in = load i8, i8 addrspace(4)* %in.byref
+define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) {
+  %in = load i8, ptr addrspace(4) %in.byref
   %ext = zext i8 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -165,20 +165,20 @@ define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %ou
 ; GCN: kernarg_segment_byte_size = 12
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
-define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
-  %in = load i16, i16 addrspace(4)* %in.byref
+define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) {
+  %in = load i16, ptr addrspace(4) %in.byref
   %ext = zext i16 %in to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}byref_constant_i32_arg:
 ; GCN: kernarg_segment_byte_size = 16
 ; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
-define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
-  %in = load i32, i32 addrspace(4)* %in.byref
-  store volatile i32 %in, i32 addrspace(1)* %out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) {
+  %in = load i32, ptr addrspace(4) %in.byref
+  store volatile i32 %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -186,11 +186,10 @@ define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %o
 ; GCN: kernarg_segment_byte_size = 36
 ; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}}
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}}
-define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
-  %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref
-  store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
-  %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)*
-  store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4
+define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) {
+  %in = load <4 x i32>, ptr addrspace(4) %in.byref
+  store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -201,10 +200,10 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* noca
 ; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]]
 ; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s
 ; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
-define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
-  %in = load i32, i32 addrspace(4)* %in.byref
-  store volatile i32 %in, i32 addrspace(1)* %out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+  %in = load i32, ptr addrspace(4) %in.byref
+  store volatile i32 %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -212,11 +211,10 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapt
 ; GCN: kernarg_segment_byte_size = 132
 ; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80
 ; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
-define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
-  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
-  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
-  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
+  %in = load <16 x i32>, ptr addrspace(4) %in.byref
+  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -224,17 +222,17 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace
 ; GCN-LABEL: {{^}}byref_global_i32_arg:
 ; GCN: kernarg_segment_byte_size = 12
 ; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}}
-define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
-  %in = load i32, i32 addrspace(1)* %in.byref
-  store i32 %in, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) {
+  %in = load i32, ptr addrspace(1) %in.byref
+  store i32 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}byref_flat_i32_arg:
 ; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
-define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
-  %in = load i32, i32* %in.byref
-  store i32 %in, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) {
+  %in = load i32, ptr %in.byref
+  store i32 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -242,27 +240,27 @@ define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out,
 ; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8
 ; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}}
 ; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
-define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
-  %in = load i32, i32 addrspace(6)* %in.byref
-  store i32 %in, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) {
+  %in = load i32, ptr addrspace(6) %in.byref
+  store i32 %in, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref %in.byref) {
-;   %in = load i32, i32 addrspace(999)* %in.byref
-;   store i32 %in, i32 addrspace(1)* %out, align 4
+; define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(999) byref %in.byref) {
+;   %in = load i32, ptr addrspace(999) %in.byref
+;   store i32 %in, ptr addrspace(1) %out, align 4
 ;   ret void
 ; }
 
 ; GCN-LABEL: {{^}}multi_byref_constant_i32_arg:
 ; GCN: kernarg_segment_byte_size = 20
 ; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
-  %in0 = load i32, i32 addrspace(4)* %in0.byref
-  %in1 = load i32, i32 addrspace(4)* %in1.byref
-  store volatile i32 %in0, i32 addrspace(1)* %out, align 4
-  store volatile i32 %in1, i32 addrspace(1)* %out, align 4
-  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) {
+  %in0 = load i32, ptr addrspace(4) %in0.byref
+  %in1 = load i32, ptr addrspace(4) %in1.byref
+  store volatile i32 %in0, ptr addrspace(1) %out, align 4
+  store volatile i32 %in1, ptr addrspace(1) %out, align 4
+  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -271,8 +269,8 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapt
 ; GCN-NOT: s4
 ; GCN-NOT: s5
 ; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}}
-define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
-  %in = load i32, i32 addrspace(4)* %in.byref
-  store i32 %in, i32 addrspace(1)* undef, align 4
+define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) {
+  %in = load i32, ptr addrspace(4) %in.byref
+  store i32 %in, ptr addrspace(1) undef, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/knownbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/knownbits-recursion.ll
index 9a193031ccdb..910911a50157 100644
--- a/llvm/test/CodeGen/AMDGPU/knownbits-recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/knownbits-recursion.ll
@@ -8,14 +8,14 @@
 ; node produced.
 
 ; GCN: v_mul_u32_u24
-define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @test(ptr addrspace(1) nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb4
 
 bb1:                                              ; preds = %bb4
-  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp46
-  store i32 %tmp46, i32 addrspace(1)* %tmp3, align 4
+  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %tmp46
+  store i32 %tmp46, ptr addrspace(1) %tmp3, align 4
   ret void
 
 bb4:                                              ; preds = %bb4, %bb

diff  --git a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
index 4b6d1d63ecba..ff91860225e2 100644
--- a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll
@@ -4,8 +4,8 @@
 
 @gv = external unnamed_addr addrspace(4) constant [239 x i32], align 4
 
-define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
-  %val = load i32, i32 addrspace(4)* getelementptr ([239 x i32], [239 x i32] addrspace(4)* @gv, i64 0, i64 239), align 4
+define amdgpu_kernel void @opencv_cvtfloat_crash(ptr addrspace(1) %out, i32 %x) nounwind {
+  %val = load i32, ptr addrspace(4) getelementptr ([239 x i32], ptr addrspace(4) @gv, i64 0, i64 239), align 4
   %mul12 = mul nsw i32 %val, 7
   br i1 undef, label %exit, label %bb
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lcssa-optnone.ll b/llvm/test/CodeGen/AMDGPU/lcssa-optnone.ll
index bda1412cbebc..4932b80e4e82 100644
--- a/llvm/test/CodeGen/AMDGPU/lcssa-optnone.ll
+++ b/llvm/test/CodeGen/AMDGPU/lcssa-optnone.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: non_uniform_loop
 ; CHECK: s_endpgm
-define amdgpu_kernel void @non_uniform_loop(float addrspace(1)* %array) {
+define amdgpu_kernel void @non_uniform_loop(ptr addrspace(1) %array) {
 entry:
   %w = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %for.cond

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index 669988d3878d..727899eb5387 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -5,22 +5,22 @@
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
 ; CHECK-NOT: v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 1023
-  store i32 %and, i32 addrspace(1)* %out, align 4
+  store i32 %and, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
 ; CHECK-NOT: v_and_b32
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 511
-  store i32 %and, i32 addrspace(1)* %out, align 4
+  store i32 %and, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -28,11 +28,11 @@ entry:
 ; CHECK-NOT: v0
 ; CHECK-NOT: v_and_b32
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
   %and = and i32 %id, 255
-  store i32 %and, i32 addrspace(1)* %out, align 4
+  store i32 %and, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index 08dea66b93c3..2348ebc9a5f9 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -33,20 +33,20 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
 
   %mul = fmul float %a, %b
   %fma = fadd float %mul, %c
-  store float %fma, float addrspace(1)* %gep.out
+  store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -72,26 +72,26 @@ define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out
 ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
-
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
-  %d = load volatile float, float addrspace(1)* %gep.3
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
 
   %mul = fmul float %a, %b
   %fma0 = fadd float %mul, %c
   %fma1 = fadd float %mul, %d
 
-  store volatile float %fma0, float addrspace(1)* %gep.out.0
-  store volatile float %fma1, float addrspace(1)* %gep.out.1
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
 }
 
@@ -109,20 +109,20 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
 
   %mul = fmul float %a, %b
   %fma = fadd float %c, %mul
-  store float %fma, float addrspace(1)* %gep.out
+  store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -139,20 +139,20 @@ define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out
 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
 
   %mul = fmul float %a, %b
   %fma = fsub float %mul, %c
-  store float %fma, float addrspace(1)* %gep.out
+  store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -176,25 +176,25 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
-
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
-  %d = load volatile float, float addrspace(1)* %gep.3
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
 
   %mul = fmul float %a, %b
   %fma0 = fsub float %mul, %c
   %fma1 = fsub float %mul, %d
-  store volatile float %fma0, float addrspace(1)* %gep.out.0
-  store volatile float %fma1, float addrspace(1)* %gep.out.1
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
 }
 
@@ -211,20 +211,20 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* no
 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
 
   %mul = fmul float %a, %b
   %fma = fsub float %c, %mul
-  store float %fma, float addrspace(1)* %gep.out
+  store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -248,25 +248,25 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
-
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
-  %d = load volatile float, float addrspace(1)* %gep.3
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
 
   %mul = fmul float %a, %b
   %fma0 = fsub float %c, %mul
   %fma1 = fsub float %d, %mul
-  store volatile float %fma0, float addrspace(1)* %gep.out.0
-  store volatile float %fma1, float addrspace(1)* %gep.out.1
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
 }
 
@@ -284,22 +284,22 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* no
 ; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
 
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
 
   %mul = fmul float %a, %b
   %mul.neg = fneg float %mul
   %fma = fsub float %mul.neg, %c
 
-  store float %fma, float addrspace(1)* %gep.out
+  store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -323,27 +323,27 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
-
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
-  %d = load volatile float, float addrspace(1)* %gep.3
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
 
   %mul = fmul float %a, %b
   %mul.neg = fneg float %mul
   %fma0 = fsub float %mul.neg, %c
   %fma1 = fsub float %mul.neg, %d
 
-  store volatile float %fma0, float addrspace(1)* %gep.out.0
-  store volatile float %fma1, float addrspace(1)* %gep.out.1
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
 }
 
@@ -367,27 +367,27 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1
-
-  %a = load volatile float, float addrspace(1)* %gep.0
-  %b = load volatile float, float addrspace(1)* %gep.1
-  %c = load volatile float, float addrspace(1)* %gep.2
-  %d = load volatile float, float addrspace(1)* %gep.3
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
 
   %mul = fmul float %a, %b
   %mul.neg = fneg float %mul
   %fma0 = fsub float %mul.neg, %c
   %fma1 = fsub float %mul, %d
 
-  store volatile float %fma0, float addrspace(1)* %gep.out.0
-  store volatile float %fma1, float addrspace(1)* %gep.out.1
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
 }
 
@@ -412,26 +412,26 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1
 ; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
-  %z = load volatile float, float addrspace(1)* %gep.2
-  %u = load volatile float, float addrspace(1)* %gep.3
-  %v = load volatile float, float addrspace(1)* %gep.4
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
+  %z = load volatile float, ptr addrspace(1) %gep.2
+  %u = load volatile float, ptr addrspace(1) %gep.3
+  %v = load volatile float, ptr addrspace(1) %gep.4
 
   %tmp0 = fmul float %u, %v
   %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
   %tmp2 = fsub float %tmp1, %z
 
-  store float %tmp2, float addrspace(1)* %gep.out
+  store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -455,26 +455,26 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
-  %z = load volatile float, float addrspace(1)* %gep.2
-  %u = load volatile float, float addrspace(1)* %gep.3
-  %v = load volatile float, float addrspace(1)* %gep.4
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
+  %z = load volatile float, ptr addrspace(1) %gep.2
+  %u = load volatile float, ptr addrspace(1) %gep.3
+  %v = load volatile float, ptr addrspace(1) %gep.4
 
   %tmp0 = fmul float %u, %v
   %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
   %tmp2 = fsub float %x, %tmp1
 
-  store float %tmp2, float addrspace(1)* %gep.out
+  store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -505,26 +505,26 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
-  %z = load volatile float, float addrspace(1)* %gep.2
-  %u = load volatile float, float addrspace(1)* %gep.3
-  %v = load volatile float, float addrspace(1)* %gep.4
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
+  %z = load volatile float, ptr addrspace(1) %gep.2
+  %u = load volatile float, ptr addrspace(1) %gep.3
+  %v = load volatile float, ptr addrspace(1) %gep.4
 
   %tmp0 = fmul float %u, %v
   %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
   %tmp2 = fsub float %tmp1, %z
 
-  store float %tmp2, float addrspace(1)* %gep.out
+  store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -556,27 +556,27 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
-  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
-
-  %x = load volatile float, float addrspace(1)* %gep.0
-  %y = load volatile float, float addrspace(1)* %gep.1
-  %z = load volatile float, float addrspace(1)* %gep.2
-  %u = load volatile float, float addrspace(1)* %gep.3
-  %v = load volatile float, float addrspace(1)* %gep.4
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr float, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile float, ptr addrspace(1) %gep.0
+  %y = load volatile float, ptr addrspace(1) %gep.1
+  %z = load volatile float, ptr addrspace(1) %gep.2
+  %u = load volatile float, ptr addrspace(1) %gep.3
+  %v = load volatile float, ptr addrspace(1) %gep.4
 
   ; nsz flag is needed since this combine may change sign of zero
   %tmp0 = fmul nsz float %u, %v
   %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
   %tmp2 = fsub nsz float %x, %tmp1
 
-  store float %tmp2, float addrspace(1)* %gep.out
+  store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
index f733ed632662..2e015a93231b 100644
--- a/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll
@@ -12,7 +12,7 @@ entry:
   br label %if.end
 
 if.end:                                           ; preds = %entry
-  %0 = load i32, i32* undef, align 4
+  %0 = load i32, ptr undef, align 4
   %mul = mul i32 %0, 3
   %cmp13 = icmp eq i32 %mul, 989619
   br i1 %cmp13, label %cleanup.cont, label %if.end15
@@ -33,11 +33,9 @@ if.end60.loopexit857:                             ; preds = %while.cond.i
   br label %if.end60
 
 if.end60:                                         ; preds = %if.end60.loopexit857, %while.cond.i
-  %1 = load i8, i8 addrspace(1)* getelementptr inbounds ([4096 x i8], [4096 x i8] addrspace(1)* @_RSENC_gDcd_______________________________, i64 0, i64 655), align 1
-  %2 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %runtimeVersionCopy, i32 0, i32 0
-  %arrayidx144260.5 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %runtimeVersionCopy, i32 0, i32 5
-  %3 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %licenseVersionCopy, i32 0, i32 0
-  %arrayidx156258.5 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %licenseVersionCopy, i32 0, i32 5
+  %1 = load i8, ptr addrspace(1) getelementptr inbounds ([4096 x i8], ptr addrspace(1) @_RSENC_gDcd_______________________________, i64 0, i64 655), align 1
+  %arrayidx144260.5 = getelementptr inbounds [128 x i8], ptr addrspace(5) %runtimeVersionCopy, i32 0, i32 5
+  %arrayidx156258.5 = getelementptr inbounds [128 x i8], ptr addrspace(5) %licenseVersionCopy, i32 0, i32 5
   switch i8 0, label %if.end5.i [
     i8 45, label %if.then.i
     i8 43, label %if.then3.i
@@ -50,9 +48,9 @@ if.then3.i:                                       ; preds = %if.end60
   br label %if.end5.i
 
 if.end5.i:                                        ; preds = %if.then3.i, %if.end60
-  %pS.addr.0.i = phi i8 addrspace(5)* [ undef, %if.then3.i ], [ %2, %if.end60 ]
-  %4 = load i8, i8 addrspace(5)* %pS.addr.0.i, align 1
-  %conv612.i = sext i8 %4 to i32
+  %pS.addr.0.i = phi ptr addrspace(5) [ undef, %if.then3.i ], [ %runtimeVersionCopy, %if.end60 ]
+  %2 = load i8, ptr addrspace(5) %pS.addr.0.i, align 1
+  %conv612.i = sext i8 %2 to i32
   %sub13.i = add nsw i32 %conv612.i, -48
   %cmp714.i = icmp ugt i32 %sub13.i, 9
   switch i8 undef, label %if.end5.i314 [
@@ -67,9 +65,9 @@ if.then3.i308:                                    ; preds = %if.end5.i
   br label %if.end5.i314
 
 if.end5.i314:                                     ; preds = %if.then3.i308, %if.end5.i
-  %pS.addr.0.i309 = phi i8 addrspace(5)* [ undef, %if.then3.i308 ], [ %3, %if.end5.i ]
-  %5 = load i8, i8 addrspace(5)* %pS.addr.0.i309, align 1
-  %conv612.i311 = sext i8 %5 to i32
+  %pS.addr.0.i309 = phi ptr addrspace(5) [ undef, %if.then3.i308 ], [ %licenseVersionCopy, %if.end5.i ]
+  %3 = load i8, ptr addrspace(5) %pS.addr.0.i309, align 1
+  %conv612.i311 = sext i8 %3 to i32
   %sub13.i312 = add nsw i32 %conv612.i311, -48
   %cmp714.i313 = icmp ugt i32 %sub13.i312, 9
   switch i8 undef, label %if.end5.i338 [
@@ -84,9 +82,9 @@ if.then3.i332:                                    ; preds = %if.end5.i314
   br label %if.end5.i338
 
 if.end5.i338:                                     ; preds = %if.then3.i332, %if.end5.i314
-  %pS.addr.0.i333 = phi i8 addrspace(5)* [ undef, %if.then3.i332 ], [ %arrayidx144260.5, %if.end5.i314 ]
-  %6 = load i8, i8 addrspace(5)* %pS.addr.0.i333, align 1
-  %conv612.i335 = sext i8 %6 to i32
+  %pS.addr.0.i333 = phi ptr addrspace(5) [ undef, %if.then3.i332 ], [ %arrayidx144260.5, %if.end5.i314 ]
+  %4 = load i8, ptr addrspace(5) %pS.addr.0.i333, align 1
+  %conv612.i335 = sext i8 %4 to i32
   %sub13.i336 = add nsw i32 %conv612.i335, -48
   %cmp714.i337 = icmp ugt i32 %sub13.i336, 9
   switch i8 undef, label %if.end5.i362 [
@@ -101,15 +99,15 @@ if.then3.i356:                                    ; preds = %if.end5.i338
   br label %if.end5.i362
 
 if.end5.i362:                                     ; preds = %if.then3.i356, %if.end5.i338
-  %pS.addr.0.i357 = phi i8 addrspace(5)* [ undef, %if.then3.i356 ], [ %arrayidx156258.5, %if.end5.i338 ]
-  %7 = load i8, i8 addrspace(5)* %pS.addr.0.i357, align 1
-  %conv612.i359 = sext i8 %7 to i32
+  %pS.addr.0.i357 = phi ptr addrspace(5) [ undef, %if.then3.i356 ], [ %arrayidx156258.5, %if.end5.i338 ]
+  %5 = load i8, ptr addrspace(5) %pS.addr.0.i357, align 1
+  %conv612.i359 = sext i8 %5 to i32
   %sub13.i360 = add nsw i32 %conv612.i359, -48
   %cmp714.i361 = icmp ugt i32 %sub13.i360, 9
-  store i8 0, i8 addrspace(5)* undef, align 16
-  %8 = load i8, i8 addrspace(1)* getelementptr inbounds ([4096 x i8], [4096 x i8] addrspace(1)* @_RSENC_gDcd_______________________________, i64 0, i64 1153), align 1
-  %arrayidx232250.1 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %pD10, i32 0, i32 1
-  store i8 %8, i8 addrspace(5)* %arrayidx232250.1, align 1
+  store i8 0, ptr addrspace(5) undef, align 16
+  %6 = load i8, ptr addrspace(1) getelementptr inbounds ([4096 x i8], ptr addrspace(1) @_RSENC_gDcd_______________________________, i64 0, i64 1153), align 1
+  %arrayidx232250.1 = getelementptr inbounds [128 x i8], ptr addrspace(5) %pD10, i32 0, i32 1
+  store i8 %6, ptr addrspace(5) %arrayidx232250.1, align 1
   switch i8 undef, label %if.end5.i400 [
     i8 45, label %if.then.i392
     i8 43, label %if.then3.i394
@@ -122,13 +120,13 @@ if.then3.i394:                                    ; preds = %if.end5.i362
   br label %if.end5.i400
 
 if.end5.i400:                                     ; preds = %if.then3.i394, %if.end5.i362
-  %pS.addr.0.i395 = phi i8 addrspace(5)* [ %arrayidx232250.1, %if.then3.i394 ], [ undef, %if.end5.i362 ]
-  %9 = load i8, i8 addrspace(5)* %pS.addr.0.i395, align 1
-  %conv612.i397 = sext i8 %9 to i32
+  %pS.addr.0.i395 = phi ptr addrspace(5) [ %arrayidx232250.1, %if.then3.i394 ], [ undef, %if.end5.i362 ]
+  %7 = load i8, ptr addrspace(5) %pS.addr.0.i395, align 1
+  %conv612.i397 = sext i8 %7 to i32
   %sub13.i398 = add nsw i32 %conv612.i397, -48
   %cmp714.i399 = icmp ugt i32 %sub13.i398, 9
-  %10 = load i8, i8* undef, align 1
-  %cmp9.not.i500 = icmp eq i8 0, %10
+  %8 = load i8, ptr undef, align 1
+  %cmp9.not.i500 = icmp eq i8 0, %8
   br label %land.lhs.true402.critedge
 
 land.lhs.true402.critedge:                        ; preds = %if.end5.i400

diff --git a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
index 927868c64225..e7855a1137a4 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.ll
@@ -12,17 +12,16 @@
 ; GCN-COUNT-3: ds_write_b64
 define amdgpu_kernel void @out_of_order_merge() {
 entry:
-  %gep1 = getelementptr inbounds [96 x double], [96 x double] addrspace(3)* @Ldisp, i32 0, i32 0
-  %gep2 = getelementptr inbounds [96 x double], [96 x double] addrspace(3)* @Ldisp, i32 0, i32 1
-  %tmp12 = load <2 x double>, <2 x double> addrspace(3)* bitcast (double addrspace(3)* getelementptr inbounds ([9 x double], [9 x double] addrspace(3)* @L, i32 0, i32 1) to <2 x double> addrspace(3)*), align 8
+  %gep2 = getelementptr inbounds [96 x double], ptr addrspace(3) @Ldisp, i32 0, i32 1
+  %tmp12 = load <2 x double>, ptr addrspace(3) getelementptr inbounds ([9 x double], ptr addrspace(3) @L, i32 0, i32 1), align 8
   %tmp14 = extractelement <2 x double> %tmp12, i32 0
   %tmp15 = extractelement <2 x double> %tmp12, i32 1
   %add50.i = fadd double %tmp14, %tmp15
-  store double %add50.i, double addrspace(3)* %gep1, align 8
-  %tmp16 = load double, double addrspace(3)* getelementptr inbounds ([9 x double], [9 x double] addrspace(3)* @L, i32 1, i32 0), align 8
-  store double %tmp16, double addrspace(3)* %gep2, align 8
-  %tmp17 = load <2 x double>, <2 x double> addrspace(3)* bitcast (double addrspace(3)* getelementptr inbounds ([9 x double], [9 x double] addrspace(3)* @L, i32 2, i32 1) to <2 x double> addrspace(3)*), align 8
+  store double %add50.i, ptr addrspace(3) @Ldisp, align 8
+  %tmp16 = load double, ptr addrspace(3) getelementptr inbounds ([9 x double], ptr addrspace(3) @L, i32 1, i32 0), align 8
+  store double %tmp16, ptr addrspace(3) %gep2, align 8
+  %tmp17 = load <2 x double>, ptr addrspace(3) getelementptr inbounds ([9 x double], ptr addrspace(3) @L, i32 2, i32 1), align 8
   %tmp19 = extractelement <2 x double> %tmp17, i32 1
-  store double %tmp19, double addrspace(3)* undef, align 8
+  store double %tmp19, ptr addrspace(3) undef, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
index 73f4f9e0cfc0..bb84d107bfe1 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -13,15 +13,14 @@
 ; CHECK: tbuffer_store_format_xyzw v[0:3],
 define amdgpu_vs void @main(i32 inreg %arg) {
 main_body:
-  %tmp = load float, float addrspace(3)* undef, align 4
-  %tmp1 = load float, float addrspace(3)* undef, align 4
-  store float %tmp, float addrspace(3)* null, align 4
+  %tmp = load float, ptr addrspace(3) undef, align 4
+  %tmp1 = load float, ptr addrspace(3) undef, align 4
+  store float %tmp, ptr addrspace(3) null, align 4
   %tmp2 = bitcast float %tmp to i32
   %tmp3 = add nuw nsw i32 0, 1
   %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr [8192 x i32], [8192 x i32] addrspace(3)* @tess_lds, i64 0, i64 %tmp4
-  %tmp6 = bitcast i32 addrspace(3)* %tmp5 to float addrspace(3)*
-  store float %tmp1, float addrspace(3)* %tmp6, align 4
+  %tmp5 = getelementptr [8192 x i32], ptr addrspace(3) @tess_lds, i64 0, i64 %tmp4
+  store float %tmp1, ptr addrspace(3) %tmp5, align 4
   %tmp7 = bitcast float %tmp1 to i32
   %tmp8 = insertelement <4 x i32> undef, i32 %tmp2, i32 0
   %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1

diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
index e363d039548b..bc0784be8a81 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -6,15 +6,14 @@
 ; CHECK: ds_read_b32
 ; CHECK: ds_write_b32
 define amdgpu_vs void @test1(i32 %v) #0 {
-  %p0 = getelementptr i32, i32 addrspace(3)* null, i32 0
-  %p1 = getelementptr i32, i32 addrspace(3)* null, i32 1
+  %p1 = getelementptr i32, ptr addrspace(3) null, i32 1
 
-  store i32 %v, i32 addrspace(3)* %p0
+  store i32 %v, ptr addrspace(3) null
 
   call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 68, i32 1)
 
-  %w = load i32, i32 addrspace(3)* %p0
-  store i32 %w, i32 addrspace(3)* %p1
+  %w = load i32, ptr addrspace(3) null
+  store i32 %w, ptr addrspace(3) %p1
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index 0df7feb6516c..88ab66fdb0a0 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -12,11 +12,11 @@
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_i8(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
 
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out, align 2
+  store i8 123, ptr addrspace(1) %out.gep.1
+  store i8 456, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -24,31 +24,31 @@ define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %o
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i32 1
 
-  store i8 123, i8 addrspace(1)* %out.gep.1
-  store i8 456, i8 addrspace(1)* %out
+  store i8 123, ptr addrspace(1) %out.gep.1
+  store i8 456, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_i16(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out, align 4
+  store i16 123, ptr addrspace(1) %out.gep.1
+  store i16 456, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 0, i16 addrspace(1)* %out.gep.1
-  store i16 0, i16 addrspace(1)* %out, align 4
+  store i16 0, ptr addrspace(1) %out.gep.1
+  store i16 0, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -56,11 +56,11 @@ define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
 
-  store i16 123, i16 addrspace(1)* %out.gep.1
-  store i16 456, i16 addrspace(1)* %out
+  store i16 123, ptr addrspace(1) %out.gep.1
+  store i16 456, ptr addrspace(1) %out
   ret void
 }
 
@@ -68,21 +68,20 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+define amdgpu_kernel void @merge_global_store_2_constants_i32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
-  store float 1.0, float addrspace(1)* %out.gep.1.bc
-  store i32 456, i32 addrspace(1)* %out
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out
   ret void
 }
 
@@ -90,11 +89,10 @@ define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
-define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  store i32 123, i32 addrspace(1)* %out.gep.1.bc
-  store float 4.0, float addrspace(1)* %out
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store float 4.0, ptr addrspace(1) %out
   ret void
 }
 
@@ -104,62 +102,60 @@ define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspac
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 ; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
-define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
+define amdgpu_kernel void @merge_global_store_4_constants_i32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 333, i32 addrspace(1)* %out.gep.3
-  store i32 1234, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out.gep.2
+  store i32 333, ptr addrspace(1) %out.gep.3
+  store i32 1234, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  store float 8.0, float addrspace(1)* %out
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store float 4.0, ptr addrspace(1) %out.gep.3
   ret void
 }
 
 ; First store is out of order.
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+define amdgpu_kernel void @merge_global_store_4_constants_f32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  store float 1.0, float addrspace(1)* %out.gep.1
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store float 4.0, float addrspace(1)* %out.gep.3
-  store float 8.0, float addrspace(1)* %out
+  store float 1.0, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store float 4.0, ptr addrspace(1) %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 ; GCN-AA: buffer_store_dwordx4 v
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
 
-  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
-  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 
-  store i32 11, i32 addrspace(1)* %out.gep.1.bc
-  store float 2.0, float addrspace(1)* %out.gep.2
-  store i32 17, i32 addrspace(1)* %out.gep.3.bc
-  store float 8.0, float addrspace(1)* %out
+  store i32 11, ptr addrspace(1) %out.gep.1
+  store float 2.0, ptr addrspace(1) %out.gep.2
+  store i32 17, ptr addrspace(1) %out.gep.3
+  store float 8.0, ptr addrspace(1) %out
   ret void
 }
 
@@ -169,108 +165,108 @@ define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float ad
 ; CI-DAG: buffer_store_dwordx3
 ; GCN-NOT: buffer_store_dword
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
+define amdgpu_kernel void @merge_global_store_3_constants_i32(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
 
-  store i32 123, i32 addrspace(1)* %out.gep.1
-  store i32 456, i32 addrspace(1)* %out.gep.2
-  store i32 1234, i32 addrspace(1)* %out
+  store i32 123, ptr addrspace(1) %out.gep.1
+  store i32 456, ptr addrspace(1) %out.gep.2
+  store i32 1234, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
+define amdgpu_kernel void @merge_global_store_2_constants_i64(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
 
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out
+  store i64 123, ptr addrspace(1) %out.gep.1
+  store i64 456, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
-  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
-  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
-  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
+define amdgpu_kernel void @merge_global_store_4_constants_i64(ptr addrspace(1) %out) #0 {
+  %out.gep.1 = getelementptr i64, ptr addrspace(1) %out, i64 1
+  %out.gep.2 = getelementptr i64, ptr addrspace(1) %out, i64 2
+  %out.gep.3 = getelementptr i64, ptr addrspace(1) %out, i64 3
 
-  store i64 123, i64 addrspace(1)* %out.gep.1
-  store i64 456, i64 addrspace(1)* %out.gep.2
-  store i64 333, i64 addrspace(1)* %out.gep.3
-  store i64 1234, i64 addrspace(1)* %out
+  store i64 123, ptr addrspace(1) %out.gep.1
+  store i64 456, ptr addrspace(1) %out.gep.2
+  store i64 333, ptr addrspace(1) %out.gep.3
+  store i64 1234, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx2 [[LOAD]]
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
 
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %lo = load i32, ptr addrspace(1) %in
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %lo, i32 addrspace(1)* %out
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  store i32 %lo, ptr addrspace(1) %out
+  store i32 %hi, ptr addrspace(1) %out.gep.1
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 3
 
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %lo = load i32, i32 addrspace(1)* %in.gep.0
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %lo = load i32, ptr addrspace(1) %in.gep.0
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %lo, i32 addrspace(1)* %out.gep.0
-  store i32 %hi, i32 addrspace(1)* %out.gep.1
+  store i32 %lo, ptr addrspace(1) %out.gep.0
+  store i32 %hi, ptr addrspace(1) %out.gep.1
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 ; GCN: buffer_load_dwordx2 v
 ; GCN: buffer_store_dwordx2 v
-define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
 
-  %lo = load i32, i32 addrspace(1)* %in
-  %hi = load i32, i32 addrspace(1)* %in.gep.1
+  %lo = load i32, ptr addrspace(1) %in
+  %hi = load i32, ptr addrspace(1) %in.gep.1
 
-  store i32 %hi, i32 addrspace(1)* %out
-  store i32 %lo, i32 addrspace(1)* %out.gep.1
+  store i32 %hi, ptr addrspace(1) %out
+  store i32 %lo, ptr addrspace(1) %out.gep.1
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
 
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
@@ -283,67 +279,67 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace
 ; SI-DAG: buffer_store_dword v
 ; CI-DAG: buffer_store_dwordx3
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
 
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
 
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr float, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr float, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr float, ptr addrspace(1) %in, i32 3
 
-  %x = load float, float addrspace(1)* %in
-  %y = load float, float addrspace(1)* %in.gep.1
-  %z = load float, float addrspace(1)* %in.gep.2
-  %w = load float, float addrspace(1)* %in.gep.3
+  %x = load float, ptr addrspace(1) %in
+  %y = load float, ptr addrspace(1) %in.gep.1
+  %z = load float, ptr addrspace(1) %in.gep.2
+  %w = load float, ptr addrspace(1) %in.gep.3
 
-  store float %x, float addrspace(1)* %out
-  store float %y, float addrspace(1)* %out.gep.1
-  store float %z, float addrspace(1)* %out.gep.2
-  store float %w, float addrspace(1)* %out.gep.3
+  store float %x, ptr addrspace(1) %out
+  store float %y, ptr addrspace(1) %out.gep.1
+  store float %z, ptr addrspace(1) %out.gep.2
+  store float %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
-  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %in.gep.0 = getelementptr i32, ptr addrspace(1) %in, i32 11
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 12
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 13
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 14
+  %out.gep.0 = getelementptr i32, ptr addrspace(1) %out, i32 7
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 8
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 9
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 10
 
-  %x = load i32, i32 addrspace(1)* %in.gep.0
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+  %x = load i32, ptr addrspace(1) %in.gep.0
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
-  store i32 %x, i32 addrspace(1)* %out.gep.0
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %x, ptr addrspace(1) %out.gep.0
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
@@ -351,26 +347,26 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: s_barrier
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
+
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
   ; Make sure the barrier doesn't stop this
   tail call void @llvm.amdgcn.s.barrier() #1
 
-  store i32 %w, i32 addrspace(1)* %out.gep.3
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %x, i32 addrspace(1)* %out
+  store i32 %w, ptr addrspace(1) %out.gep.3
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %x, ptr addrspace(1) %out
 
   ret void
 }
@@ -382,26 +378,26 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 a
 ; GCN: buffer_load_dwordx4 v
 ; GCN: s_barrier
 ; GCN: buffer_store_dwordx4 v
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
-  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
-
-  %x = load i32, i32 addrspace(1)* %in
-  %y = load i32, i32 addrspace(1)* %in.gep.1
-  %z = load i32, i32 addrspace(1)* %in.gep.2
-  %w = load i32, i32 addrspace(1)* %in.gep.3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %in.gep.1 = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %in.gep.2 = getelementptr i32, ptr addrspace(1) %in, i32 2
+  %in.gep.3 = getelementptr i32, ptr addrspace(1) %in, i32 3
+
+  %x = load i32, ptr addrspace(1) %in
+  %y = load i32, ptr addrspace(1) %in.gep.1
+  %z = load i32, ptr addrspace(1) %in.gep.2
+  %w = load i32, ptr addrspace(1) %in.gep.3
 
   ; Make sure the barrier doesn't stop this
   tail call void @llvm.amdgcn.s.barrier() #1
 
-  store i32 %w, i32 addrspace(1)* %out
-  store i32 %z, i32 addrspace(1)* %out.gep.1
-  store i32 %y, i32 addrspace(1)* %out.gep.2
-  store i32 %x, i32 addrspace(1)* %out.gep.3
+  store i32 %w, ptr addrspace(1) %out
+  store i32 %z, ptr addrspace(1) %out.gep.1
+  store i32 %y, ptr addrspace(1) %out.gep.2
+  store i32 %x, ptr addrspace(1) %out.gep.3
 
   ret void
 }
@@ -410,23 +406,23 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 a
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 ; GCN: buffer_store_dword [[LOAD]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
+  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
+  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
 
-  %x = load i8, i8 addrspace(1)* %in, align 4
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
+  %x = load i8, ptr addrspace(1) %in, align 4
+  %y = load i8, ptr addrspace(1) %in.gep.1
+  %z = load i8, ptr addrspace(1) %in.gep.2
+  %w = load i8, ptr addrspace(1) %in.gep.3
 
-  store i8 %x, i8 addrspace(1)* %out, align 4
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
+  store i8 %x, ptr addrspace(1) %out, align 4
+  store i8 %y, ptr addrspace(1) %out.gep.1
+  store i8 %z, ptr addrspace(1) %out.gep.2
+  store i8 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
@@ -440,23 +436,23 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
-  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
-  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
-  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
-  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
-  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(1) %out, i8 1
+  %out.gep.2 = getelementptr i8, ptr addrspace(1) %out, i8 2
+  %out.gep.3 = getelementptr i8, ptr addrspace(1) %out, i8 3
+  %in.gep.1 = getelementptr i8, ptr addrspace(1) %in, i8 1
+  %in.gep.2 = getelementptr i8, ptr addrspace(1) %in, i8 2
+  %in.gep.3 = getelementptr i8, ptr addrspace(1) %in, i8 3
 
-  %x = load i8, i8 addrspace(1)* %in
-  %y = load i8, i8 addrspace(1)* %in.gep.1
-  %z = load i8, i8 addrspace(1)* %in.gep.2
-  %w = load i8, i8 addrspace(1)* %in.gep.3
+  %x = load i8, ptr addrspace(1) %in
+  %y = load i8, ptr addrspace(1) %in.gep.1
+  %z = load i8, ptr addrspace(1) %in.gep.2
+  %w = load i8, ptr addrspace(1) %in.gep.3
 
-  store i8 %x, i8 addrspace(1)* %out
-  store i8 %y, i8 addrspace(1)* %out.gep.1
-  store i8 %z, i8 addrspace(1)* %out.gep.2
-  store i8 %w, i8 addrspace(1)* %out.gep.3
+  store i8 %x, ptr addrspace(1) %out
+  store i8 %y, ptr addrspace(1) %out.gep.1
+  store i8 %z, ptr addrspace(1) %out.gep.2
+  store i8 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
@@ -464,32 +460,32 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
-  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(1) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(1) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(1) %out, i32 3
+  %vec = load <4 x i32>, ptr addrspace(1) %in
 
   %x = extractelement <4 x i32> %vec, i32 0
   %y = extractelement <4 x i32> %vec, i32 1
   %z = extractelement <4 x i32> %vec, i32 2
   %w = extractelement <4 x i32> %vec, i32 3
 
-  store i32 %x, i32 addrspace(1)* %out
-  store i32 %y, i32 addrspace(1)* %out.gep.1
-  store i32 %z, i32 addrspace(1)* %out.gep.2
-  store i32 %w, i32 addrspace(1)* %out.gep.3
+  store i32 %x, ptr addrspace(1) %out
+  store i32 %y, ptr addrspace(1) %out.gep.1
+  store i32 %z, ptr addrspace(1) %out.gep.2
+  store i32 %w, ptr addrspace(1) %out.gep.3
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 ; GCN: ds_write_b16
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+define amdgpu_kernel void @merge_local_store_2_constants_i8(ptr addrspace(3) %out) #0 {
+  %out.gep.1 = getelementptr i8, ptr addrspace(3) %out, i32 1
 
-  store i8 123, i8 addrspace(3)* %out.gep.1
-  store i8 456, i8 addrspace(3)* %out, align 2
+  store i8 123, ptr addrspace(3) %out.gep.1
+  store i8 456, ptr addrspace(3) %out, align 2
   ret void
 }
 
@@ -497,11 +493,11 @@ define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %ou
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
-define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+define amdgpu_kernel void @merge_local_store_2_constants_i32(ptr addrspace(3) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
 
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out
+  store i32 123, ptr addrspace(3) %out.gep.1
+  store i32 456, ptr addrspace(3) %out
   ret void
 }
 
@@ -515,15 +511,15 @@ define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
-  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
-  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
-  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
+define amdgpu_kernel void @merge_local_store_4_constants_i32(ptr addrspace(3) %out) #0 {
+  %out.gep.1 = getelementptr i32, ptr addrspace(3) %out, i32 1
+  %out.gep.2 = getelementptr i32, ptr addrspace(3) %out, i32 2
+  %out.gep.3 = getelementptr i32, ptr addrspace(3) %out, i32 3
 
-  store i32 123, i32 addrspace(3)* %out.gep.1
-  store i32 456, i32 addrspace(3)* %out.gep.2
-  store i32 333, i32 addrspace(3)* %out.gep.3
-  store i32 1234, i32 addrspace(3)* %out
+  store i32 123, ptr addrspace(3) %out.gep.1
+  store i32 456, ptr addrspace(3) %out.gep.2
+  store i32 333, ptr addrspace(3) %out.gep.3
+  store i32 1234, ptr addrspace(3) %out
   ret void
 }
 
@@ -533,34 +529,34 @@ define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %
 ; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 ; GCN: buffer_store_dword v[[HI]]
-define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
-  store i32 9, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 12, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 16, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 -12, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
+define amdgpu_kernel void @merge_global_store_5_constants_i32(ptr addrspace(1) %out) {
+  store i32 9, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 12, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 16, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 -12, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 11, ptr addrspace(1) %idx4, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
-  store i32 13, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 15, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 62, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 63, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 11, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 123, i32 addrspace(1)* %idx5, align 4
+define amdgpu_kernel void @merge_global_store_6_constants_i32(ptr addrspace(1) %out) {
+  store i32 13, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 15, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 62, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 63, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 11, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 123, ptr addrspace(1) %idx5, align 4
   ret void
 }
 
@@ -568,20 +564,20 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)*
 ; GCN: buffer_store_dwordx4
 ; SI-DAG: buffer_store_dwordx2
 ; CI: buffer_store_dwordx3
-define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
+define amdgpu_kernel void @merge_global_store_7_constants_i32(ptr addrspace(1) %out) {
+  store i32 34, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 999, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 65, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 33, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 98, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 91, ptr addrspace(1) %idx5, align 4
+  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
+  store i32 212, ptr addrspace(1) %idx6, align 4
   ret void
 }
 
@@ -589,22 +585,22 @@ define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)*
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
-  store i32 34, i32 addrspace(1)* %out, align 4
-  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 999, i32 addrspace(1)* %idx1, align 4
-  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
-  store i32 65, i32 addrspace(1)* %idx2, align 4
-  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
-  store i32 33, i32 addrspace(1)* %idx3, align 4
-  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
-  store i32 98, i32 addrspace(1)* %idx4, align 4
-  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
-  store i32 91, i32 addrspace(1)* %idx5, align 4
-  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
-  store i32 212, i32 addrspace(1)* %idx6, align 4
-  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
-  store i32 999, i32 addrspace(1)* %idx7, align 4
+define amdgpu_kernel void @merge_global_store_8_constants_i32(ptr addrspace(1) %out) {
+  store i32 34, ptr addrspace(1) %out, align 4
+  %idx1 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 999, ptr addrspace(1) %idx1, align 4
+  %idx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 2
+  store i32 65, ptr addrspace(1) %idx2, align 4
+  %idx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
+  store i32 33, ptr addrspace(1) %idx3, align 4
+  %idx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 4
+  store i32 98, ptr addrspace(1) %idx4, align 4
+  %idx5 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 5
+  store i32 91, ptr addrspace(1) %idx5, align 4
+  %idx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 6
+  store i32 212, ptr addrspace(1) %idx6, align 4
+  %idx7 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 7
+  store i32 999, ptr addrspace(1) %idx7, align 4
   ret void
 }
 
@@ -625,9 +621,9 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)*
 ; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
+define amdgpu_kernel void @copy_v3i32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %vec = load <3 x i32>, ptr addrspace(1) %in, align 4
+  store <3 x i32> %vec, ptr addrspace(1) %out
   ret void
 }
 
@@ -641,9 +637,9 @@ define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %ou
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
-  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
+define amdgpu_kernel void @copy_v3i64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %vec = load <3 x i64>, ptr addrspace(1) %in, align 4
+  store <3 x i64> %vec, ptr addrspace(1) %out
   ret void
 }
 
@@ -659,10 +655,10 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou
 ; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
+define amdgpu_kernel void @copy_v3f32_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %vec = load <3 x float>, ptr addrspace(1) %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
-  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
+  store <3 x float> %fadd, ptr addrspace(1) %out
   ret void
 }
 
@@ -676,10 +672,10 @@ define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
-  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
+define amdgpu_kernel void @copy_v3f64_align4(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+  %vec = load <3 x double>, ptr addrspace(1) %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
-  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
+  store <3 x double> %fadd, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index fc8ffb31cda6..7f0f473c11bd 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -16,10 +16,10 @@
 ; GCN-DAG: s_mov_b32 s7, 0xe8f000
 ; GCN-DAG: v_mov_b32_e32 [[V:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[V]], v0, s[4:7], 0 offen
-define amdgpu_ps void @scratch_ps(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_ps void @scratch_ps(ptr addrspace(1) %out, i32 %in) {
 entry:
   %alloca = alloca [32 x i32], addrspace(5)
-  %ptr = getelementptr [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %in
-  store volatile i32 2, i32 addrspace(5)* %ptr
+  %ptr = getelementptr [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %in
+  store volatile i32 2, ptr addrspace(5) %ptr
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 650eddf900ea..9e711bad7e16 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
-@ptr_load = addrspace(3) global i32 addrspace(4)* undef, align 8
+@ptr_load = addrspace(3) global ptr addrspace(4) undef, align 8
 
 ; Make sure when the load from %ptr2 is folded the chain isn't lost,
 ; resulting in losing the store to gptr
@@ -14,14 +14,14 @@
 ; SI-DAG: buffer_store_dword
 ; SI-DAG: buffer_store_dword
 ; SI:     s_endpgm
-define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @ptr_load, align 8
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2
+define amdgpu_kernel void @missing_store_reduced(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
+  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @ptr_load, align 8
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2
 
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4
+  store i32 99, ptr addrspace(1) %gptr, align 4
+  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4
 
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index d93b55a01359..e1cd951cfb5c 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -18,15 +18,15 @@
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]],
 
-define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], ptr addrspace(1) %ptrarg, i32 %arg3) #0 {
 bb:
   %tmp = icmp sgt i32 %arg3, 0
   br i1 %tmp, label %bb4, label %bb17
 
 bb4:
-  %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
-  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
-  %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
+  %tmp14 = load volatile ptr addrspace(1), ptr addrspace(1) %ptrarg
+  %tmp15 = getelementptr inbounds i8, ptr addrspace(1) %tmp14, i64 %arg1
+  %tmp16 = load volatile i8, ptr addrspace(1) %tmp15
   br label %bb17
 
 bb17:

diff  --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index b965623d980e..e9c9983f5df8 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -11,18 +11,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
-define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
-  %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
+  %tid.gep = getelementptr ptr addrspace(1), ptr addrspace(1) %in, i32 %tid
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(1) %tid.gep
   %xor = xor i32 %tid, 1
   %cmp = icmp ne i32 %xor, 0
   br i1 %cmp, label %atomic, label %exit
 
 atomic:
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100
-  %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst
-  store i32 %ret, i32 addrspace(1)* %out
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
+  %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y seq_cst
+  store i32 %ret, ptr addrspace(1) %out
   br label %exit
 
 exit:
@@ -31,17 +31,17 @@ exit:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_noret:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
-define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32_noret(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
-  %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
+  %tid.gep = getelementptr ptr addrspace(1), ptr addrspace(1) %in, i32 %tid
+  %ptr = load volatile ptr addrspace(1), ptr addrspace(1) %tid.gep
   %xor = xor i32 %tid, 1
   %cmp = icmp ne i32 %xor, 0
   br i1 %cmp, label %atomic, label %exit
 
 atomic:
-  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100
-  %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 100
+  %ret = atomicrmw max ptr addrspace(1) %gep, i32 %y seq_cst
   br label %exit
 
 exit:

diff  --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
index f44dcc0f1a80..7164f6498de1 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-worklist.ll
@@ -13,7 +13,7 @@
 ; GCN-NEXT: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @in_worklist_once() #0 {
 bb:
-	%tmp = load i64, i64 addrspace(5)* undef
+	%tmp = load i64, ptr addrspace(5) undef
 br label %bb1
 
 bb1:                                              ; preds = %bb1, %bb

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 3d597b998a65..bccdbfc636d7 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -122,12 +122,12 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES0]], off
 ; W32-DAG: global_store_{{dword|b32}} v{{\[[0-9]+:[0-9]+\]}}, [[RES1]], off
 
-define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %out0, float addrspace(1)* %out1) #0 {
+define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
 entry:
   %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1
   %val1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %j, i32 %c, i32 0, i32 0, i32 0) #1
-  store volatile float %val0, float addrspace(1)* %out0
-  store volatile float %val1, float addrspace(1)* %out1
+  store volatile float %val0, ptr addrspace(1) %out0
+  store volatile float %val1, ptr addrspace(1) %out1
   ret void
 }
 
@@ -307,7 +307,7 @@ entry:
 ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Reload
 ; W64-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off
 
-define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 {
+define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 entry:
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
   %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %live.out.reg, i32 0, i32 0, i32 0) #1
@@ -321,7 +321,7 @@ bb1:
 
 bb2:
   %val = phi float [ %val0, %entry ], [ %val1, %bb1 ]
-  store volatile float %val, float addrspace(1)* %out
+  store volatile float %val, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
index f984bb49c7b7..e93a8ad33368 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -7,113 +7,113 @@
 ; GCN-LABEL: {{^}}store_private_offset_i8:
 ; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i8() #0 {
-  store volatile i8 5, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
+  store volatile i8 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i16:
 ; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i16() #0 {
-  store volatile i16 5, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
+  store volatile i16 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i32:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_i32() #0 {
-  store volatile i32 5, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*)
+  store volatile i32 5, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v2i32:
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_v2i32() #0 {
-  store volatile <2 x i32> <i32 5, i32 10>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*)
+  store volatile <2 x i32> <i32 5, i32 10>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v4i32:
 ; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @store_private_offset_v4i32() #0 {
-  store volatile <4 x i32> <i32 5, i32 10, i32 15, i32 0>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*)
+  store volatile <4 x i32> <i32 5, i32 10, i32 15, i32 0>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i8:
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i8() #0 {
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}sextload_private_offset_i8:
 ; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
-define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 {
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
+define amdgpu_kernel void @sextload_private_offset_i8(ptr addrspace(1) %out) #0 {
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   %sextload = sext i8 %load to i32
-  store i32 %sextload, i32 addrspace(1)* undef
+  store i32 %sextload, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}zextload_private_offset_i8:
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8
-define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 {
-  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*)
+define amdgpu_kernel void @zextload_private_offset_i8(ptr addrspace(1) %out) #0 {
+  %load = load volatile i8, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   %zextload = zext i8 %load to i32
-  store i32 %zextload, i32 addrspace(1)* undef
+  store i32 %zextload, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i16:
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i16() #0 {
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}sextload_private_offset_i16:
 ; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8
-define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 {
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
+define amdgpu_kernel void @sextload_private_offset_i16(ptr addrspace(1) %out) #0 {
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   %sextload = sext i16 %load to i32
-  store i32 %sextload, i32 addrspace(1)* undef
+  store i32 %sextload, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}zextload_private_offset_i16:
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8
-define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 {
-  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*)
+define amdgpu_kernel void @zextload_private_offset_i16(ptr addrspace(1) %out) #0 {
+  %load = load volatile i16, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   %zextload = zext i16 %load to i32
-  store i32 %zextload, i32 addrspace(1)* undef
+  store i32 %zextload, ptr addrspace(1) undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i32:
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_i32() #0 {
-  %load = load volatile i32, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*)
+  %load = load volatile i32, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v2i32:
 ; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_v2i32() #0 {
-  %load = load volatile <2 x i32>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*)
+  %load = load volatile <2 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v4i32:
 ; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8
 define amdgpu_kernel void @load_private_offset_v4i32() #0 {
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*)
+  %load = load volatile <4 x i32>, ptr addrspace(5) inttoptr (i32 8 to ptr addrspace(5))
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset:
 ; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095
 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
-  store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4095 to i8 addrspace(5)*)
+  store volatile i8 5, ptr addrspace(5) inttoptr (i32 4095 to ptr addrspace(5))
   ret void
 }
 
@@ -121,7 +121,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
 ; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
-  store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4096 to i8 addrspace(5)*)
+  store volatile i8 5, ptr addrspace(5) inttoptr (i32 4096 to ptr addrspace(5))
   ret void
 }
 
@@ -129,7 +129,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
 ; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
-  store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4097 to i8 addrspace(5)*)
+  store volatile i8 5, ptr addrspace(5) inttoptr (i32 4097 to ptr addrspace(5))
   ret void
 }
 
@@ -145,10 +145,10 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
 ; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
-  %vaddr = load volatile i32, i32 addrspace(1)* undef
+  %vaddr = load volatile i32, ptr addrspace(1) undef
   %vaddr.off = add i32 %vaddr, 8
-  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %vaddr.off
-  store volatile i32 9, i32 addrspace(5)* %gep
+  %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %vaddr.off
+  store volatile i32 9, ptr addrspace(5) %gep
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
index 9057bfb2a49d..255c8fafd7bf 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
@@ -5,40 +5,40 @@
 
 ; CHECK-LABEL: {{^}}test_none:
 ; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define amdgpu_vs float @test_none(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_none(ptr addrspace(4) inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i
+  %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32
   %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i32 0)
   ret float %tmp7
 }
 
 ; CHECK-LABEL: {{^}}test_idxen:
 ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}}
-define amdgpu_vs float @test_idxen(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_idxen(ptr addrspace(4) inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i
+  %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32
   %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0, i32 0)
   ret float %tmp7
 }
 
 ; CHECK-LABEL: {{^}}test_offen:
 ; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
-define amdgpu_vs float @test_offen(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_offen(ptr addrspace(4) inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i
+  %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32
   %tmp7 = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i32 0)
   ret float %tmp7
 }
 
 ; CHECK-LABEL: {{^}}test_both:
 ; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}}
-define amdgpu_vs float @test_both(<4 x i32> addrspace(4)* inreg %base, i32 %i) {
+define amdgpu_vs float @test_both(ptr addrspace(4) inreg %base, i32 %i) {
 main_body:
-  %ptr = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %base, i32 %i
-  %tmp2 = load <4 x i32>, <4 x i32> addrspace(4)* %ptr, align 32
+  %ptr = getelementptr <4 x i32>, ptr addrspace(4) %base, i32 %i
+  %tmp2 = load <4 x i32>, ptr addrspace(4) %ptr, align 32
   %tmp7 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i32 0, i32 0)
   ret float %tmp7
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll
index 2d7b8c914b7c..c8ea6f023fd0 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll
@@ -7,22 +7,22 @@
 ; MUBUF load with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load0:
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
-define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load0(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
-  %1 = load i32, i32 addrspace(1)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(1) %in, i64 1
+  %1 = load i32, ptr addrspace(1) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
 ; MUBUF load with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_load1:
 ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
-define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
-  %1 = load i8, i8 addrspace(1)* %0
-  store i8 %1, i8 addrspace(1)* %out
+  %0 = getelementptr i8, ptr addrspace(1) %in, i64 4095
+  %1 = load i8, ptr addrspace(1) %0
+  store i8 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -30,11 +30,11 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_load2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
-define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
-  %1 = load i32, i32 addrspace(1)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(1) %in, i64 1024
+  %1 = load i32, ptr addrspace(1) %0
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -42,21 +42,20 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
-define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+define amdgpu_kernel void @mubuf_load3(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %offset) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
-  %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
-  %2 = load i32, i32 addrspace(1)* %1
-  store i32 %2, i32 addrspace(1)* %out
+  %0 = getelementptr i32, ptr addrspace(1) %in, i64 %offset
+  %1 = getelementptr i32, ptr addrspace(1) %0, i64 1
+  %2 = load i32, ptr addrspace(1) %1
+  store i32 %2, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: {{^}}soffset_max_imm:
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc
-define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(4)* inreg, [17 x <4 x i32>] addrspace(4)* inreg, [16 x <4 x i32>] addrspace(4)* inreg, [32 x <8 x i32>] addrspace(4)* inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
+define amdgpu_gs void @soffset_max_imm(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
 main_body:
-  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
-  %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
+  %tmp1 = load <4 x i32>, ptr addrspace(4) %0
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1)
   %tmp4 = add i32 %6, 16
@@ -72,10 +71,9 @@ main_body:
 ; CHECK-LABEL: {{^}}soffset_no_fold:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41
 ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc
-define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(4)* inreg, [17 x <4 x i32>] addrspace(4)* inreg, [16 x <4 x i32>] addrspace(4)* inreg, [32 x <8 x i32>] addrspace(4)* inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
+define amdgpu_gs void @soffset_no_fold(ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, ptr addrspace(4) inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) {
 main_body:
-  %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0
-  %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0
+  %tmp1 = load <4 x i32>, ptr addrspace(4) %0
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1)
   %tmp4 = add i32 %6, 16
@@ -91,10 +89,10 @@ main_body:
 ; MUBUF store with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_store0:
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
-define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store0(ptr addrspace(1) %out) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
-  store i32 0, i32 addrspace(1)* %0
+  %0 = getelementptr i32, ptr addrspace(1) %out, i64 1
+  store i32 0, ptr addrspace(1) %0
   ret void
 }
 
@@ -102,10 +100,10 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store1:
 ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
 
-define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store1(ptr addrspace(1) %out) {
 entry:
-  %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
-  store i8 0, i8 addrspace(1)* %0
+  %0 = getelementptr i8, ptr addrspace(1) %out, i64 4095
+  store i8 0, ptr addrspace(1) %0
   ret void
 }
 
@@ -113,10 +111,10 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
-define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store2(ptr addrspace(1) %out) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
-  store i32 0, i32 addrspace(1)* %0
+  %0 = getelementptr i32, ptr addrspace(1) %out, i64 1024
+  store i32 0, ptr addrspace(1) %0
   ret void
 }
 
@@ -124,53 +122,53 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
-define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+define amdgpu_kernel void @mubuf_store3(ptr addrspace(1) %out, i64 %offset) {
 entry:
-  %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
-  %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
-  store i32 0, i32 addrspace(1)* %1
+  %0 = getelementptr i32, ptr addrspace(1) %out, i64 %offset
+  %1 = getelementptr i32, ptr addrspace(1) %0, i64 1
+  store i32 0, ptr addrspace(1) %1
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) {
-  store i32 99, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @store_sgpr_ptr(ptr addrspace(1) %out) {
+  store i32 99, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) {
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
-  store i32 99, i32 addrspace(1)* %out.gep, align 4
+define amdgpu_kernel void @store_sgpr_ptr_offset(ptr addrspace(1) %out) {
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 10
+  store i32 99, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) {
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
-  store i32 99, i32 addrspace(1)* %out.gep, align 4
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(ptr addrspace(1) %out) {
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 32768
+  store i32 99, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) {
-  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
-  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(ptr addrspace(1) %out) {
+  %gep = getelementptr i32, ptr addrspace(1) %out, i32 32768
+  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 5 seq_cst
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_vgpr_ptr(ptr addrspace(1) %out) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
-  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
-  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
+  store i32 99, ptr addrspace(1) %out.gep, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/nand.ll b/llvm/test/CodeGen/AMDGPU/nand.ll
index 6084b74cd86a..200b67ea7e68 100644
--- a/llvm/test/CodeGen/AMDGPU/nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/nand.ll
@@ -6,11 +6,11 @@
 ; GCN-LABEL: {{^}}scalar_nand_i32_one_use
 ; GCN: s_nand_b32
 define amdgpu_kernel void @scalar_nand_i32_one_use(
-    i32 addrspace(1)* %r0, i32 %a, i32 %b) {
+    ptr addrspace(1) %r0, i32 %a, i32 %b) {
 entry:
   %and = and i32 %a, %b
   %r0.val = xor i32 %and, -1
-  store i32 %r0.val, i32 addrspace(1)* %r0
+  store i32 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -20,24 +20,24 @@ entry:
 ; GCN: s_not_b32
 ; GCN: s_add_i32
 define amdgpu_kernel void @scalar_nand_i32_mul_use(
-    i32 addrspace(1)* %r0, i32 addrspace(1)* %r1, i32 %a, i32 %b) {
+    ptr addrspace(1) %r0, ptr addrspace(1) %r1, i32 %a, i32 %b) {
 entry:
   %and = and i32 %a, %b
   %r0.val = xor i32 %and, -1
   %r1.val = add i32 %and, %a
-  store i32 %r0.val, i32 addrspace(1)* %r0
-  store i32 %r1.val, i32 addrspace(1)* %r1
+  store i32 %r0.val, ptr addrspace(1) %r0
+  store i32 %r1.val, ptr addrspace(1) %r1
   ret void
 }
 
 ; GCN-LABEL: {{^}}scalar_nand_i64_one_use
 ; GCN: s_nand_b64
 define amdgpu_kernel void @scalar_nand_i64_one_use(
-    i64 addrspace(1)* %r0, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, i64 %a, i64 %b) {
 entry:
   %and = and i64 %a, %b
   %r0.val = xor i64 %and, -1
-  store i64 %r0.val, i64 addrspace(1)* %r0
+  store i64 %r0.val, ptr addrspace(1) %r0
   ret void
 }
 
@@ -48,13 +48,13 @@ entry:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
 define amdgpu_kernel void @scalar_nand_i64_mul_use(
-    i64 addrspace(1)* %r0, i64 addrspace(1)* %r1, i64 %a, i64 %b) {
+    ptr addrspace(1) %r0, ptr addrspace(1) %r1, i64 %a, i64 %b) {
 entry:
   %and = and i64 %a, %b
   %r0.val = xor i64 %and, -1
   %r1.val = add i64 %and, %a
-  store i64 %r0.val, i64 addrspace(1)* %r0
-  store i64 %r1.val, i64 addrspace(1)* %r1
+  store i64 %r0.val, ptr addrspace(1) %r0
+  store i64 %r1.val, ptr addrspace(1) %r1
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index dda0eabb4e3f..b9f3981a30b6 100644
--- a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -9,9 +9,9 @@
 @extern_const_addrspace = external unnamed_addr addrspace(4) constant [5 x i32], align 4
 
 ; CHECK-DAG: Name: load_extern_const_init
-define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @extern_const_addrspace, i64 0, i64 3), align 4
-  store i32 %val, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @load_extern_const_init(ptr addrspace(1) %out) nounwind {
+  %val = load i32, ptr addrspace(4) getelementptr ([5 x i32], ptr addrspace(4) @extern_const_addrspace, i64 0, i64 3), align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -19,8 +19,8 @@ define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwi
 @undef_const_addrspace = unnamed_addr addrspace(4) constant [5 x i32] undef, align 4
 
 ; CHECK-DAG: Name: undef_const_addrspace
-define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
-  %val = load i32, i32 addrspace(4)* getelementptr ([5 x i32], [5 x i32] addrspace(4)* @undef_const_addrspace, i64 0, i64 3), align 4
-  store i32 %val, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @load_undef_const_init(ptr addrspace(1) %out) nounwind {
+  %val = load i32, ptr addrspace(4) getelementptr ([5 x i32], ptr addrspace(4) @undef_const_addrspace, i64 0, i64 3), align 4
+  store i32 %val, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
index 0b9438ab8d46..1edd5206f0f9 100644
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -17,32 +17,32 @@
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @simple_barrier(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
   tail call void @llvm.amdgcn.wave.barrier()
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   ret void
 }
 
@@ -55,10 +55,10 @@ bb:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @memory_phi_no_clobber(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
 ; CHECK:       if.then:
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
@@ -67,15 +67,15 @@ define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) {
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br i1 undef, label %if.then, label %if.else
 
 if.then:
@@ -87,11 +87,11 @@ if.else:
   br label %if.end
 
 if.end:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   ret void
 }
 
@@ -101,33 +101,33 @@ if.end:
 ; GCN: global_store_dword
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @memory_phi_clobber1(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
-; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
 ; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
 ; CHECK:       if.else:
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br i1 undef, label %if.then, label %if.else
 
 if.then:
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
-  store i32 1, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
+  store i32 1, ptr addrspace(1) %gep, align 4
   br label %if.end
 
 if.else:
@@ -135,11 +135,11 @@ if.else:
   br label %if.end
 
 if.end:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   ret void
 }
 
@@ -149,28 +149,28 @@ if.end:
 ; GCN: s_barrier
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @memory_phi_clobber2(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
 ; CHECK:       if.then:
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    br label [[IF_END:%.*]], !amdgpu.uniform !0
 ; CHECK:       if.else:
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3
-; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[GEP]], align 4
 ; CHECK-NEXT:    br label [[IF_END]], !amdgpu.uniform !0
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br i1 undef, label %if.then, label %if.else
 
 if.then:
@@ -178,16 +178,16 @@ if.then:
   br label %if.end
 
 if.else:
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
-  store i32 1, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
+  store i32 1, ptr addrspace(1) %gep, align 4
   br label %if.end
 
 if.end:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   ret void
 }
 
@@ -196,32 +196,32 @@ if.end:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) {
+define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) {
 ; CHECK-LABEL: @no_clobbering_loop1(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
 ; CHECK:       while.cond:
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
 ; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br label %while.cond
 
 while.cond:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   tail call void @llvm.amdgcn.wave.barrier()
   br i1 %cc, label %while.cond, label %end
 
@@ -234,34 +234,34 @@ end:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) {
+define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) {
 ; CHECK-LABEL: @no_clobbering_loop2(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
 ; CHECK:       while.cond:
 ; CHECK-NEXT:    [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
 ; CHECK-NEXT:    [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i32 [[C]], !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    [[I3]] = add i32 [[I2]], [[ACC]]
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[C]], 1
 ; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
 ; CHECK:       end:
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br label %while.cond
 
 while.cond:
   %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
   %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %c
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %c
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %acc
   tail call void @llvm.amdgcn.wave.barrier()
   %inc = add nuw nsw i32 %c, 1
@@ -269,7 +269,7 @@ while.cond:
   br i1 %cc, label %while.cond, label %end
 
 end:
-  store i32 %i3, i32 addrspace(1)* %out, align 4
+  store i32 %i3, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -277,32 +277,32 @@ end:
 ; GCN: s_load_dword s
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) {
+define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) {
 ; CHECK-LABEL: @clobbering_loop(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
 ; CHECK-NEXT:    br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
 ; CHECK:       while.cond:
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.wave.barrier()
 ; CHECK-NEXT:    br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
   br label %while.cond
 
 while.cond:
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   tail call void @llvm.amdgcn.wave.barrier()
   br i1 %cc, label %while.cond, label %end
 
@@ -315,28 +315,28 @@ end:
 ; GCN: global_load_dword {{.*}} glc
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) {
 ; CHECK-LABEL: @clobber_by_atomic_load(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0
-; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0
-; CHECK-NEXT:    [[I2:%.*]] = load i32, i32 addrspace(1)* [[I1]], align 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0
+; CHECK-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0
+; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
 ; CHECK-NEXT:    [[I3:%.*]] = add i32 [[I2]], [[I]]
-; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 4
-; CHECK-NEXT:    store i32 [[I3]], i32 addrspace(1)* [[I4]], align 4
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4
+; CHECK-NEXT:    store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
-  %i = load i32, i32 addrspace(1)* %arg, align 4
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
-  %val = load atomic i32, i32 addrspace(1)* %gep  seq_cst, align 4
-  %i1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
-  %i2 = load i32, i32 addrspace(1)* %i1, align 4
+  %i = load i32, ptr addrspace(1) %arg, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
+  %val = load atomic i32, ptr addrspace(1) %gep  seq_cst, align 4
+  %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
+  %i2 = load i32, ptr addrspace(1) %i1, align 4
   %i3 = add i32 %i2, %i
-  %i4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
-  store i32 %i3, i32 addrspace(1)* %i4, align 4
+  %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
+  store i32 %i3, ptr addrspace(1) %i4, align 4
   ret void
 }
 
@@ -346,24 +346,24 @@ bb:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @no_alias_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) @LDS, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  store i32 0, i32 addrspace(3)* @LDS, align 4
+  store i32 0, ptr addrspace(3) @LDS, align 4
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -372,24 +372,24 @@ entry:
 ; GCN: s_barrier
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @may_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @may_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @may_alias_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  store i32 0, i32 addrspace(1)* %out, align 4
+  store i32 0, ptr addrspace(1) %out, align 4
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -399,24 +399,24 @@ entry:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @no_alias_volatile_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @no_alias_volatile_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+; CHECK-NEXT:    store volatile i32 0, ptr addrspace(3) @LDS, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  store volatile i32 0, i32 addrspace(3)* @LDS, align 4
+  store volatile i32 0, ptr addrspace(3) @LDS, align 4
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -425,18 +425,18 @@ entry:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -445,24 +445,24 @@ entry:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
+define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
 ; CHECK-LABEL: @no_alias_atomic_cmpxchg(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 %swap seq_cst monotonic
+  %unused = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 %swap seq_cst monotonic
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -471,24 +471,24 @@ entry:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @no_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @no_alias_atomic_rmw(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -496,24 +496,24 @@ entry:
 ; GCN: global_atomic_cmpswap
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
+define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
 ; CHECK-LABEL: @may_alias_atomic_cmpxchg(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(1)* [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg ptr addrspace(1) [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = cmpxchg i32 addrspace(1)* %out, i32 7, i32 %swap seq_cst monotonic
+  %unused = cmpxchg ptr addrspace(1) %out, i32 7, i32 %swap seq_cst monotonic
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -521,24 +521,24 @@ entry:
 ; GCN: global_atomic_add
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @may_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; CHECK-LABEL: @may_alias_atomic_rmw(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(1)* [[OUT:%.*]], i32 5 seq_cst, align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 seq_cst, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %unused = atomicrmw add i32 addrspace(1)* %out, i32 5 seq_cst
+  %unused = atomicrmw add ptr addrspace(1) %out, i32 5 seq_cst
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -548,28 +548,28 @@ entry:
 ; GCN: ds_add_u32
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
+define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
 ; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[OUT:%.*]], align 4
-; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  store i32 1, i32 addrspace(1)* %out, align 4
-  store i32 2, i32 addrspace(1)* %noalias, align 4
-  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  store i32 1, ptr addrspace(1) %out, align 4
+  store i32 2, ptr addrspace(1) %noalias, align 4
+  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -579,26 +579,26 @@ entry:
 ; GCN: s_load_dword s
 ; GCN-NOT: global_load_dword
 ; GCN: global_store_dword
-define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
+define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
 ; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
-; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
 ; CHECK-NEXT:    fence syncscope("workgroup") release
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    fence syncscope("workgroup") acquire
-; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  store i32 2, i32 addrspace(1)* %noalias, align 4
-  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  store i32 2, ptr addrspace(1) %noalias, align 4
+  %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
   fence syncscope("workgroup") release
   tail call void @llvm.amdgcn.s.barrier()
   fence syncscope("workgroup") acquire
-  %ld = load i32, i32 addrspace(1)* %in, align 4
-  store i32 %ld, i32 addrspace(1)* %out, align 4
+  %ld = load i32, ptr addrspace(1) %in, align 4
+  store i32 %ld, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/nop-data.ll b/llvm/test/CodeGen/AMDGPU/nop-data.ll
index 691279f933e5..903602fbcfb3 100644
--- a/llvm/test/CodeGen/AMDGPU/nop-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/nop-data.ll
@@ -81,7 +81,7 @@ entry:
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <kernel1>:
 ; CHECK: s_endpgm
-define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(4)* %ptr.out) align 256 {
+define amdgpu_kernel void @kernel1(ptr addrspace(4) %ptr.out) align 256 {
 entry:
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 16292f0ebee0..ed3d98dff154 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -1,101 +1,101 @@
 ;RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,GCN %s
 ;RUN: llc < %s -march=r600 -mtriple=r600-- -verify-machineinstrs | FileCheck -check-prefixes=CHECK,R600 %s
 
-%struct.S = type { i32 addrspace(5)*, i32 addrspace(1)*, i32 addrspace(4)*, i32 addrspace(3)*, i32*, i32 addrspace(2)*}
+%struct.S = type { ptr addrspace(5), ptr addrspace(1), ptr addrspace(4), ptr addrspace(3), ptr, ptr addrspace(2)}
 
 ; CHECK-LABEL: nullptr_priv:
 ; CHECK-NEXT: .long -1
-@nullptr_priv = global i32 addrspace(5)* addrspacecast (i32* null to i32 addrspace(5)*)
+@nullptr_priv = global ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
 
 ; CHECK-LABEL: nullptr_glob:
 ; GCN-NEXT: .quad 0
 ; R600-NEXT: .long 0
-@nullptr_glob = global i32 addrspace(1)* addrspacecast (i32* null to i32 addrspace(1)*)
+@nullptr_glob = global ptr addrspace(1) addrspacecast (ptr null to ptr addrspace(1))
 
 ; CHECK-LABEL: nullptr_const:
 ; GCN-NEXT: .quad 0
 ; R600-NEXT: .long 0
-@nullptr_const = global i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*)
+@nullptr_const = global ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4))
 
 ; CHECK-LABEL: nullptr_local:
 ; CHECK-NEXT: .long -1
-@nullptr_local = global i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*)
+@nullptr_local = global ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
 
 ; CHECK-LABEL: nullptr_region:
 ; CHECK-NEXT: .long -1
-@nullptr_region = global i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)
+@nullptr_region = global ptr addrspace(2) addrspacecast (ptr null to ptr addrspace(2))
 
 ; CHECK-LABEL: nullptr6:
 ; R600-NEXT: .long 0
-@nullptr6 = global i32 addrspace(6)* addrspacecast (i32* null to i32 addrspace(6)*)
+@nullptr6 = global ptr addrspace(6) addrspacecast (ptr null to ptr addrspace(6))
 
 ; CHECK-LABEL: nullptr7:
 ; R600-NEXT: .long 0
-@nullptr7 = global i32 addrspace(7)* addrspacecast (i32* null to i32 addrspace(7)*)
+@nullptr7 = global ptr addrspace(7) addrspacecast (ptr null to ptr addrspace(7))
 
 ; CHECK-LABEL: nullptr8:
 ; R600-NEXT: .long 0
-@nullptr8 = global i32 addrspace(8)* addrspacecast (i32* null to i32 addrspace(8)*)
+@nullptr8 = global ptr addrspace(8) addrspacecast (ptr null to ptr addrspace(8))
 
 ; CHECK-LABEL: nullptr9:
 ; R600-NEXT: .long 0
-@nullptr9 = global i32 addrspace(9)* addrspacecast (i32* null to i32 addrspace(9)*)
+@nullptr9 = global ptr addrspace(9) addrspacecast (ptr null to ptr addrspace(9))
 
 ; CHECK-LABEL: nullptr10:
 ; R600-NEXT: .long 0
-@nullptr10 = global i32 addrspace(10)* addrspacecast (i32* null to i32 addrspace(10)*)
+@nullptr10 = global ptr addrspace(10) addrspacecast (ptr null to ptr addrspace(10))
 
 ; CHECK-LABEL: nullptr11:
 ; R600-NEXT: .long 0
-@nullptr11 = global i32 addrspace(11)* addrspacecast (i32* null to i32 addrspace(11)*)
+@nullptr11 = global ptr addrspace(11) addrspacecast (ptr null to ptr addrspace(11))
 
 ; CHECK-LABEL: nullptr12:
 ; R600-NEXT: .long 0
-@nullptr12 = global i32 addrspace(12)* addrspacecast (i32* null to i32 addrspace(12)*)
+@nullptr12 = global ptr addrspace(12) addrspacecast (ptr null to ptr addrspace(12))
 
 ; CHECK-LABEL: nullptr13:
 ; R600-NEXT: .long 0
-@nullptr13 = global i32 addrspace(13)* addrspacecast (i32* null to i32 addrspace(13)*)
+@nullptr13 = global ptr addrspace(13) addrspacecast (ptr null to ptr addrspace(13))
 
 ; CHECK-LABEL: nullptr14:
 ; R600-NEXT: .long 0
-@nullptr14 = global i32 addrspace(14)* addrspacecast (i32* null to i32 addrspace(14)*)
+@nullptr14 = global ptr addrspace(14) addrspacecast (ptr null to ptr addrspace(14))
 
 ; CHECK-LABEL: nullptr15:
 ; R600-NEXT: .long 0
-@nullptr15 = global i32 addrspace(15)* addrspacecast (i32* null to i32 addrspace(15)*)
+@nullptr15 = global ptr addrspace(15) addrspacecast (ptr null to ptr addrspace(15))
 
 ; CHECK-LABEL: nullptr16:
 ; R600-NEXT: .long 0
-@nullptr16 = global i32 addrspace(16)* addrspacecast (i32* null to i32 addrspace(16)*)
+@nullptr16 = global ptr addrspace(16) addrspacecast (ptr null to ptr addrspace(16))
 
 ; CHECK-LABEL: nullptr17:
 ; R600-NEXT: .long 0
-@nullptr17 = global i32 addrspace(17)* addrspacecast (i32* null to i32 addrspace(17)*)
+@nullptr17 = global ptr addrspace(17) addrspacecast (ptr null to ptr addrspace(17))
 
 ; CHECK-LABEL: nullptr18:
 ; R600-NEXT: .long 0
-@nullptr18 = global i32 addrspace(18)* addrspacecast (i32* null to i32 addrspace(18)*)
+@nullptr18 = global ptr addrspace(18) addrspacecast (ptr null to ptr addrspace(18))
 
 ; CHECK-LABEL: nullptr19:
 ; R600-NEXT: .long 0
-@nullptr19 = global i32 addrspace(19)* addrspacecast (i32* null to i32 addrspace(19)*)
+@nullptr19 = global ptr addrspace(19) addrspacecast (ptr null to ptr addrspace(19))
 
 ; CHECK-LABEL: nullptr20:
 ; R600-NEXT: .long 0
-@nullptr20 = global i32 addrspace(20)* addrspacecast (i32* null to i32 addrspace(20)*)
+@nullptr20 = global ptr addrspace(20) addrspacecast (ptr null to ptr addrspace(20))
 
 ; CHECK-LABEL: nullptr21:
 ; R600-NEXT: .long 0
-@nullptr21 = global i32 addrspace(21)* addrspacecast (i32* null to i32 addrspace(21)*)
+@nullptr21 = global ptr addrspace(21) addrspacecast (ptr null to ptr addrspace(21))
 
 ; CHECK-LABEL: nullptr22:
 ; R600-NEXT: .long 0
-@nullptr22 = global i32 addrspace(22)* addrspacecast (i32* null to i32 addrspace(22)*)
+@nullptr22 = global ptr addrspace(22) addrspacecast (ptr null to ptr addrspace(22))
 
 ; CHECK-LABEL: nullptr23:
 ; R600-NEXT: .long 0
-@nullptr23 = global i32 addrspace(23)* addrspacecast (i32* null to i32 addrspace(23)*)
+@nullptr23 = global ptr addrspace(23) addrspacecast (ptr null to ptr addrspace(23))
 
 ; CHECK-LABEL: structWithPointers:
 ; CHECK-NEXT: .long -1
@@ -111,9 +111,9 @@
 ; CHECK-NEXT: .long -1
 ; GCN-NEXT:   .zero 4
 @structWithPointers = addrspace(1) global %struct.S {
-  i32 addrspace(5)* addrspacecast (i32* null to i32 addrspace(5)*),
-  i32 addrspace(1)* addrspacecast (i32* null to i32 addrspace(1)*),
-  i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*),
-  i32 addrspace(3)* addrspacecast (i32* null to i32 addrspace(3)*),
-  i32* null,
-  i32 addrspace(2)* addrspacecast (i32* null to i32 addrspace(2)*)}, align 4
+  ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5)),
+  ptr addrspace(1) addrspacecast (ptr null to ptr addrspace(1)),
+  ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)),
+  ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)),
+  ptr null,
+  ptr addrspace(2) addrspacecast (ptr null to ptr addrspace(2))}, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 07f8ee2255dc..6ef70f45bd50 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -334,8 +334,7 @@ define amdgpu_kernel void @used_101_sgprs() {
 ; GFX1100:    ; Occupancy: 16
 @lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6552() {
-  %p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds6552
   ret void
 }
 
@@ -346,8 +345,7 @@ define amdgpu_kernel void @used_lds_6552() {
 ; GFX1100:    ; Occupancy: 16
 @lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_6556() {
-  %p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds6556
   ret void
 }
 
@@ -358,8 +356,7 @@ define amdgpu_kernel void @used_lds_6556() {
 ; GFX1100:    ; Occupancy: 16
 @lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_13112() {
-  %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds13112
   ret void
 }
 
@@ -371,8 +368,7 @@ define amdgpu_kernel void @used_lds_13112() {
 ; GFX1100W32: ; Occupancy: 14{{$}}
 @lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
 define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -384,8 +380,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
 ; GFX1100W64: ; Occupancy: 14{{$}}
 ; GFX1100W32: ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -397,8 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
 ; GFX1100W64: ; Occupancy: 14{{$}}
 ; GFX1100W32: ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -408,8 +402,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
 ; GFX1030:    ; Occupancy: 16{{$}}
 ; GFX1100:    ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -419,8 +412,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
 ; GFX1030:    ; Occupancy: 16{{$}}
 ; GFX1100:    ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -430,8 +422,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
 ; GFX1030:    ; Occupancy: 16{{$}}
 ; GFX1100:    ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -441,8 +432,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
 ; GFX1030:    ; Occupancy: 16{{$}}
 ; GFX1100:    ; Occupancy: 16{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 
@@ -451,8 +441,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
 ; GFX10:      ; Occupancy: 7{{$}}
 ; GFX1100:    ; Occupancy: 7{{$}}
 define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
-  %p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
-  store volatile i8 1, i8 addrspace(3)* %p
+  store volatile i8 1, ptr addrspace(3) @lds8252
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 3d6fda883169..efc443c5cb13 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
 
 ; IEEE bit enabled for compute kernel, so shouldn't use.
-define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
 ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -39,17 +39,17 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrsp
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
-  store float %div2, float addrspace(1)* %out.gep
+  store float %div2, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; IEEE bit enabled for compute kernel, so shouldn't use.
-define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrspace(1)* %out, double addrspace(1)* %aptr) #4 {
+define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
 ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -85,17 +85,17 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrs
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
-  %a = load double, double addrspace(1)* %gep0
+  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
+  %a = load double, ptr addrspace(1) %gep0
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
-  store double %div2, double addrspace(1)* %out.gep
+  store double %div2, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed
-define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
 ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -131,17 +131,17 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
-  %a = load float, float addrspace(1)* %gep0
+  %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %a = load float, ptr addrspace(1) %gep0
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
-  store float %div2, float addrspace(1)* %out.gep
+  store float %div2, ptr addrspace(1) %out.gep
   ret void
 }
 
 ; IEEE bit enabled for compute kernel, so shouldn't use even though nsz is allowed.
-define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)* %out, double addrspace(1)* %aptr) #5 {
+define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 {
 ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -177,12 +177,12 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)*
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
-  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
-  %a = load double, double addrspace(1)* %gep0
+  %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
+  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
+  %a = load double, ptr addrspace(1) %gep0
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
-  store double %div2, double addrspace(1)* %out.gep
+  store double %div2, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -205,7 +205,7 @@ define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -228,7 +228,7 @@ define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
 ; VI-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
-  store double %div2, double addrspace(1)* undef
+  store double %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -248,7 +248,7 @@ define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -268,7 +268,7 @@ define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
 ; VI-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 0.5
-  store double %div2, double addrspace(1)* undef
+  store double %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -288,7 +288,7 @@ define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 2.0
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -308,7 +308,7 @@ define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
 ; VI-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 2.0
-  store double %div2, double addrspace(1)* undef
+  store double %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -328,7 +328,7 @@ define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 4.0
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -348,7 +348,7 @@ define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
 ; VI-NEXT:    s_endpgm
   %add = fadd nsz double %a, 1.0
   %div2 = fmul nsz double %add, 4.0
-  store double %div2, double addrspace(1)* undef
+  store double %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -374,8 +374,8 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 4.0
-  store float %div2, float addrspace(1)* undef
-  store volatile float %add, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
+  store volatile float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -396,7 +396,7 @@ define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
   %add = fadd float %a, 1.0
   call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
   %div2 = fmul float %add, 4.0
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -420,7 +420,7 @@ define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
 
   %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
-  store float %clamp, float addrspace(1)* undef
+  store float %clamp, ptr addrspace(1) undef
   ret void
 }
 
@@ -445,7 +445,7 @@ define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
   %div2 = fmul float %clamp, 0.5
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -468,7 +468,7 @@ define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
   %add = fadd float %a, 1.0
   %abs.add = call float @llvm.fabs.f32(float %add)
   %div2 = fmul float %abs.add, 0.5
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -489,7 +489,7 @@ define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
   %add = fadd float %a, %a
   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
-  store float %clamp, float addrspace(1)* undef
+  store float %clamp, ptr addrspace(1) undef
   ret void
 }
 
@@ -512,7 +512,7 @@ define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
   %add = fadd float %clamp, %clamp
-  store float %add, float addrspace(1)* undef
+  store float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -535,7 +535,7 @@ define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
   %add = fadd float %abs.x, %abs.x
-  store float %add, float addrspace(1)* undef
+  store float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -558,7 +558,7 @@ define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
   %add = fadd float %abs.x, %x
-  store float %add, float addrspace(1)* undef
+  store float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -581,7 +581,7 @@ define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
   %x = fadd float %a, 1.0
   %abs.x = call float @llvm.fabs.f32(float %x)
   %add = fadd float %x, %abs.x
-  store float %add, float addrspace(1)* undef
+  store float %add, ptr addrspace(1) undef
   ret void
 }
 
@@ -605,7 +605,7 @@ define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
   %add = fadd float %a, 1.0
   %div2.0 = fmul float %add, 0.5
   %div2.1 = fmul float %div2.0, 0.5
-  store float %div2.1, float addrspace(1)* undef
+  store float %div2.1, ptr addrspace(1) undef
   ret void
 }
 
@@ -628,7 +628,7 @@ define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %div2 = fmul float %add, 0.5
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -651,7 +651,7 @@ define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
 ; VI-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %div2 = fmul double %add, 0.5
-  store double %div2, double addrspace(1)* undef
+  store double %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -674,7 +674,7 @@ define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
 ; VI-NEXT:    s_endpgm
   %add = fadd float %a, 1.0
   %mul2 = fadd float %add, %add
-  store float %mul2, float addrspace(1)* undef
+  store float %mul2, ptr addrspace(1) undef
   ret void
 }
 
@@ -697,7 +697,7 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
 ; VI-NEXT:    s_endpgm
   %add = fadd double %a, 1.0
   %mul2 = fadd double %add, %add
-  store double %mul2, double addrspace(1)* undef
+  store double %mul2, ptr addrspace(1) undef
   ret void
 }
 
@@ -722,7 +722,7 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
-  store half %div2, half addrspace(1)* undef
+  store half %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -747,7 +747,7 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
 ; VI-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %mul2 = fadd half %add, %add
-  store half %mul2, half addrspace(1)* undef
+  store half %mul2, ptr addrspace(1) undef
   ret void
 }
 
@@ -770,7 +770,7 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
 ; VI-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
-  store half %div2, half addrspace(1)* undef
+  store half %div2, ptr addrspace(1) undef
   ret void
 }
 
@@ -794,7 +794,7 @@ define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
   %add = fadd float %mul, %b
   %mad = fmul float %add, 2.0
   %res = fmul float %mad, %b
-  store float %res, float addrspace(1)* undef
+  store float %res, ptr addrspace(1) undef
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
index a6af2f0ab68e..5fe2364db16c 100644
--- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -15,8 +15,8 @@
 ; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
 
 ; GCN: buffer_store_dword [[RESULT]],
-define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
+define amdgpu_kernel void @add_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b) {
   %result = fadd float %a, %b
-  store float %result, float addrspace(1)* %out
+  store float %result, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index 28be3738c658..c6c85f10ff4a 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @if_masked_1(i32 %arg, i32 addrspace(1)* %p)  {
+define amdgpu_kernel void @if_masked_1(i32 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0x24
@@ -16,11 +16,11 @@ define amdgpu_kernel void @if_masked_1(i32 %arg, i32 addrspace(1)* %p)  {
   %and = and i32 %arg, 1
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 22, i32 33
-  store i32 %sel, i32 addrspace(1)* %p
+  store i32 %sel, ptr addrspace(1) %p
   ret void
 }
 
-define amdgpu_kernel void @if_masked_1024(i32 %arg, i32 addrspace(1)* %p)  {
+define amdgpu_kernel void @if_masked_1024(i32 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_1024:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0x24
@@ -35,11 +35,11 @@ define amdgpu_kernel void @if_masked_1024(i32 %arg, i32 addrspace(1)* %p)  {
   %and = and i32 %arg, 1024
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 22, i32 33
-  store i32 %sel, i32 addrspace(1)* %p
+  store i32 %sel, ptr addrspace(1) %p
   ret void
 }
 
-define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, i32 addrspace(1)* %p)  {
+define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_0x80000000:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0x24
@@ -54,12 +54,12 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, i32 addrspace(1)* %p)
   %and = and i32 %arg, 2147483648
   %cmp = icmp eq i32 %and, 0
   %sel = select i1 %cmp, i32 22, i32 33
-  store i32 %sel, i32 addrspace(1)* %p
+  store i32 %sel, ptr addrspace(1) %p
   ret void
 }
 
 ; FIXME: this should result in "s_bitcmp0_b64 $arg, 63" or "s_bitcmp0_b32 $arg.sub1, 31"
-define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, i32 addrspace(1)* %p)  {
+define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p)  {
 ; GCN-LABEL: if_masked_0x8000000000000000:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -75,6 +75,6 @@ define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, i32 addrspace(
   %and = and i64 %arg, 9223372036854775808
   %cmp = icmp eq i64 %and, 0
   %sel = select i1 %cmp, i32 22, i32 33
-  store i32 %sel, i32 addrspace(1)* %p
+  store i32 %sel, ptr addrspace(1) %p
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index b4eb682d5c08..33fdfacc1676 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -8,12 +8,12 @@
 ; GCN-NOT: v_cmp
 ; GCN:   s_andn2_b64 vcc, exec, [[CC]]
 ; GCN:   s_cbranch_vccnz .LBB0_2
-define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) {
+define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 bb:
   br label %bb1
 
 bb1:
-  %tmp1 = load i32, i32 addrspace(1)* %arg1
+  %tmp1 = load i32, ptr addrspace(1) %arg1
   %tmp2 = icmp eq i32 %tmp1, 0
   br label %bb2
 
@@ -28,8 +28,8 @@ bb3:
 
 bb4:
   %tmp6 = phi i32 [ %tmp5, %bb3 ], [ %tmp4, %bb2 ]
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp6
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp6
+  store i32 0, ptr addrspace(1) %gep
   %tmp7 = icmp eq i32 %tmp6, 32
   br i1 %tmp7, label %bb1, label %bb2
 }
@@ -47,12 +47,12 @@ bb4:
 ; GCN:   s_mov_b64 vcc, exec
 ; GCN:   s_cbranch_execnz [[BB0]]
 ; GCN: [[BB2]]:
-define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
+define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) {
 bb:
   br label %bb2
 
 bb2:
-  %tmp1 = load i32, i32 addrspace(1)* %arg1
+  %tmp1 = load i32, ptr addrspace(1) %arg1
   %tmp2 = icmp eq i32 %tmp1, 0
   br label %bb4
 
@@ -74,8 +74,8 @@ bb6:
 
 bb7:
   %tmp7 = phi i32 [ %tmp5, %bb5 ], [ %tmp6, %bb6 ]
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp7
-  store i32 0, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg1, i32 %tmp7
+  store i32 0, ptr addrspace(1) %gep
   %tmp8 = icmp eq i32 %tmp7, 32
   br i1 %tmp8, label %bb3, label %bb4
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll
index 566b48eb8864..33a6a82bd7db 100644
--- a/llvm/test/CodeGen/AMDGPU/predicates.ll
+++ b/llvm/test/CodeGen/AMDGPU/predicates.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: {{^}}simple_if:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if(ptr addrspace(1) %out, i32 %in) {
 entry:
   %cmp0 = icmp sgt i32 %in, 0
   br i1 %cmp0, label %IF, label %ENDIF
@@ -17,7 +17,7 @@ IF:
 
 ENDIF:
   %tmp2 = phi i32 [ %in, %entry ], [ %tmp1, %IF ]
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   ret void
 }
 
@@ -25,7 +25,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if_else(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF, label %ELSE
@@ -40,7 +40,7 @@ ELSE:
 
 ENDIF:
   %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ]
-  store i32 %3, i32 addrspace(1)* %out
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
@@ -51,7 +51,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
@@ -67,7 +67,7 @@ IF1:
 
 ENDIF:
   %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1]
-  store i32 %4, i32 addrspace(1)* %out
+  store i32 %4, ptr addrspace(1) %out
   ret void
 }
 
@@ -79,7 +79,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if_else(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
@@ -99,6 +99,6 @@ ELSE1:
 
 ENDIF:
   %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1]
-  store i32 %5, i32 addrspace(1)* %out
+  store i32 %5, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
index 17051df841df..c9686e661fe1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll
@@ -6,73 +6,69 @@
 ; GCN-LABEL: ptr_nest_3:
 ; GCN-COUNT-2: global_load_dwordx2
 ; GCN:         global_store_dword
-define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) {
+define amdgpu_kernel void @ptr_nest_3(ptr addrspace(1) nocapture readonly %Arg) {
 ; CHECK-LABEL: @ptr_nest_3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
-; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
-  %p2 = load float**, float** addrspace(1)* %p1, align 8
-  %p3 = load float*, float** %p2, align 8
-  store float 0.000000e+00, float* %p3, align 4
+  %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
+  %p2 = load ptr, ptr addrspace(1) %p1, align 8
+  %p3 = load ptr, ptr %p2, align 8
+  store float 0.000000e+00, ptr %p3, align 4
   ret void
 }
 
 ; GCN-LABEL: ptr_bitcast:
 ; GCN: global_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) {
+define amdgpu_kernel void @ptr_bitcast(ptr nocapture readonly %Arg) {
 ; CHECK-LABEL: @ptr_bitcast(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
+; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]]
-; CHECK-NEXT:    [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)*
-; CHECK-NEXT:    [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[P2_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
-  %p1.cast = bitcast float** %p1 to i32**
-  %p2 = load i32*, i32** %p1.cast, align 8
-  store i32 0, i32* %p2, align 4
+  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
+  %p2 = load ptr, ptr %p1, align 8
+  store i32 0, ptr %p2, align 4
   ret void
 }
 
-%struct.S = type { float* }
+%struct.S = type { ptr }
 
 ; GCN-LABEL: ptr_in_struct:
 ; GCN: s_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) {
+define amdgpu_kernel void @ptr_in_struct(ptr addrspace(1) nocapture readonly %Arg) {
 ; CHECK-LABEL: @ptr_in_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0
-; CHECK-NEXT:    [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+; CHECK-NEXT:    [[P1:%.*]] = load ptr, ptr addrspace(1) [[ARG:%.*]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P1_GLOBAL:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]]
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[P1_GLOBAL]], i32 [[ID]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0
-  %p1 = load float*, float* addrspace(1)* %p, align 8
+  %p1 = load ptr, ptr addrspace(1) %Arg, align 8
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %arrayidx = getelementptr inbounds float, float* %p1, i32 %id
-  store float 0.000000e+00, float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr %p1, i32 %id
+  store float 0.000000e+00, ptr %arrayidx, align 4
   ret void
 }
 
@@ -82,76 +78,76 @@ entry:
 ; GCN-COUNT-2: global_load_dwordx2
 ; GCN:         global_load_dwordx4
 ; GCN:         global_store_dword
-define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
+define amdgpu_kernel void @flat_ptr_arg(ptr nocapture readonly noalias %Arg, ptr nocapture noalias %Out, i32 %X) {
 ; CHECK-LABEL: @flat_ptr_arg(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)*
-; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
+; CHECK-NEXT:    [[OUT_GLOBAL:%.*]] = addrspacecast ptr [[OUT:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
-; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
-; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
-; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
-; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
-; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
+; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
-; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
-; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
-; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
+; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
+; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
-; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
-; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
+; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
-; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
-; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
+; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[OUT_GLOBAL]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I7:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX11]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I7_GLOBAL:%.*]] = addrspacecast ptr [[I7]] to ptr addrspace(1)
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I7_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = zext i32 %i to i64
-  %arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom
-  %i1 = load float*, float** %arrayidx10, align 8
-  %i2 = load float, float* %i1, align 4
-  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
-  store float %i2, float addrspace(3)* %arrayidx512, align 4
-  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
-  %i3 = load float, float* %arrayidx3.1, align 4
+  %arrayidx10 = getelementptr inbounds ptr, ptr %Arg, i64 %idxprom
+  %i1 = load ptr, ptr %arrayidx10, align 8
+  %i2 = load float, ptr %i1, align 4
+  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
+  store float %i2, ptr addrspace(3) %arrayidx512, align 4
+  %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
+  %i3 = load float, ptr %arrayidx3.1, align 4
   %add.1 = add nsw i32 %X, 1
-  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
-  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
-  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
-  %i4 = load float, float* %arrayidx3.2, align 4
+  %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
+  store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
+  %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
+  %i4 = load float, ptr %arrayidx3.2, align 4
   %add.2 = add nsw i32 %X, 2
-  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
-  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
-  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
-  %i5 = load float, float* %arrayidx3.3, align 4
+  %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
+  store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
+  %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
+  %i5 = load float, ptr %arrayidx3.3, align 4
   %add.3 = add nsw i32 %X, 3
-  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
-  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
+  %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
+  store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
   %sub = add nsw i32 %X, -1
-  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
-  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
-  %arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom
-  %i7 = load float*, float** %arrayidx11, align 8
+  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
+  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
+  %arrayidx11 = getelementptr inbounds ptr, ptr %Out, i64 %idxprom
+  %i7 = load ptr, ptr %arrayidx11, align 8
   %idxprom8 = sext i32 %X to i64
-  %arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8
-  store float %i6, float* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds float, ptr %i7, i64 %idxprom8
+  store float %i6, ptr %arrayidx9, align 4
   ret void
 }
 
@@ -159,69 +155,69 @@ entry:
 ; GCN: global_load_dwordx2
 ; GCN: global_load_dwordx4
 ; GCN: global_store_dword
-define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
+define amdgpu_kernel void @global_ptr_arg(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
 ; CHECK-LABEL: @global_ptr_arg(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
-; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
-; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
-; CHECK-NEXT:    [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X:%.*]]
+; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 1
+; CHECK-NEXT:    [[I3:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_1]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[X]], 1
-; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
-; CHECK-NEXT:    store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
-; CHECK-NEXT:    [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_1]]
+; CHECK-NEXT:    store float [[I3]], ptr addrspace(3) [[ARRAYIDX512_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 2
+; CHECK-NEXT:    [[I4:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_2]], align 4
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[X]], 2
-; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
-; CHECK-NEXT:    store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
-; CHECK-NEXT:    [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_2]]
+; CHECK-NEXT:    store float [[I4]], ptr addrspace(3) [[ARRAYIDX512_2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 3
+; CHECK-NEXT:    [[I5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX3_3]], align 4
 ; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[X]], 3
-; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
-; CHECK-NEXT:    store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[ADD_3]]
+; CHECK-NEXT:    store float [[I5]], ptr addrspace(3) [[ARRAYIDX512_3]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
-; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
-; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
+; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = zext i32 %i to i64
-  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
-  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
-  %i2 = load float, float* %i1, align 4
-  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
-  store float %i2, float addrspace(3)* %arrayidx512, align 4
-  %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
-  %i3 = load float, float* %arrayidx3.1, align 4
+  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
+  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
+  %i2 = load float, ptr %i1, align 4
+  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
+  store float %i2, ptr addrspace(3) %arrayidx512, align 4
+  %arrayidx3.1 = getelementptr inbounds float, ptr %i1, i64 1
+  %i3 = load float, ptr %arrayidx3.1, align 4
   %add.1 = add nsw i32 %X, 1
-  %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
-  store float %i3, float addrspace(3)* %arrayidx512.1, align 4
-  %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
-  %i4 = load float, float* %arrayidx3.2, align 4
+  %arrayidx512.1 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.1
+  store float %i3, ptr addrspace(3) %arrayidx512.1, align 4
+  %arrayidx3.2 = getelementptr inbounds float, ptr %i1, i64 2
+  %i4 = load float, ptr %arrayidx3.2, align 4
   %add.2 = add nsw i32 %X, 2
-  %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
-  store float %i4, float addrspace(3)* %arrayidx512.2, align 4
-  %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
-  %i5 = load float, float* %arrayidx3.3, align 4
+  %arrayidx512.2 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.2
+  store float %i4, ptr addrspace(3) %arrayidx512.2, align 4
+  %arrayidx3.3 = getelementptr inbounds float, ptr %i1, i64 3
+  %i5 = load float, ptr %arrayidx3.3, align 4
   %add.3 = add nsw i32 %X, 3
-  %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
-  store float %i5, float addrspace(3)* %arrayidx512.3, align 4
+  %arrayidx512.3 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %add.3
+  store float %i5, ptr addrspace(3) %arrayidx512.3, align 4
   %sub = add nsw i32 %X, -1
-  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
-  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
+  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
+  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
   %idxprom8 = sext i32 %X to i64
-  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
-  store float %i6, float* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
+  store float %i6, ptr %arrayidx9, align 4
   ret void
 }
 
@@ -230,42 +226,42 @@ entry:
 ; GCN: global_load_dwordx2
 ; GCN: flat_load_dword
 ; GCN: flat_store_dword
-define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
+define amdgpu_kernel void @global_ptr_arg_clobbered(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
 ; CHECK-LABEL: @global_ptr_arg_clobbered(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
-; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
-; CHECK-NEXT:    [[I2:%.*]] = load float, float* [[I1]], align 4
-; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
-; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
+; CHECK-NEXT:    store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr [[I1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
+; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
-; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
-; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
+; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], float* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[I1]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = zext i32 %i to i64
-  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
-  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
-  store float* null, float* addrspace(1)* %arrayidx11, align 4
-  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
-  %i2 = load float, float* %i1, align 4
-  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
-  store float %i2, float addrspace(3)* %arrayidx512, align 4
+  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
+  %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
+  store ptr null, ptr addrspace(1) %arrayidx11, align 4
+  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
+  %i2 = load float, ptr %i1, align 4
+  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
+  store float %i2, ptr addrspace(3) %arrayidx512, align 4
   %sub = add nsw i32 %X, -1
-  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
-  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
+  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
+  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
   %idxprom8 = sext i32 %X to i64
-  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
-  store float %i6, float* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
+  store float %i6, ptr %arrayidx9, align 4
   ret void
 }
 
@@ -274,69 +270,69 @@ entry:
 ; GCN: global_store_dwordx2
 ; GCN: global_load_dword
 ; GCN: global_store_dword
-define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
+define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(ptr addrspace(1) nocapture readonly %Arg, i32 %X) {
 ; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
-; CHECK-NEXT:    store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
-; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
-; CHECK-NEXT:    store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[I1:%.*]] = load ptr, ptr addrspace(1) [[ARRAYIDX10]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[I1_GLOBAL:%.*]] = addrspacecast ptr [[I1]] to ptr addrspace(1)
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARRAYIDX10]], i32 [[X:%.*]]
+; CHECK-NEXT:    store ptr null, ptr addrspace(1) [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[I2:%.*]] = load float, ptr addrspace(1) [[I1_GLOBAL]], align 4
+; CHECK-NEXT:    [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[X]]
+; CHECK-NEXT:    store float [[I2]], ptr addrspace(3) [[ARRAYIDX512]], align 4
 ; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X]], -1
-; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
-; CHECK-NEXT:    [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
+; CHECK-NEXT:    [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 [[SUB]]
+; CHECK-NEXT:    [[I6:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX711]], align 4
 ; CHECK-NEXT:    [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
-; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
-; CHECK-NEXT:    store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[I1_GLOBAL]], i64 [[IDXPROM8]]
+; CHECK-NEXT:    store float [[I6]], ptr addrspace(1) [[ARRAYIDX9]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = zext i32 %i to i64
-  %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
-  %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
-  %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
-  store float* null, float* addrspace(1)* %arrayidx11, align 4
-  %i2 = load float, float* %i1, align 4
-  %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
-  store float %i2, float addrspace(3)* %arrayidx512, align 4
+  %arrayidx10 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i64 %idxprom
+  %i1 = load ptr, ptr addrspace(1) %arrayidx10, align 8
+  %arrayidx11 = getelementptr inbounds ptr, ptr addrspace(1) %arrayidx10, i32 %X
+  store ptr null, ptr addrspace(1) %arrayidx11, align 4
+  %i2 = load float, ptr %i1, align 4
+  %arrayidx512 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %X
+  store float %i2, ptr addrspace(3) %arrayidx512, align 4
   %sub = add nsw i32 %X, -1
-  %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
-  %i6 = load float, float addrspace(3)* %arrayidx711, align 4
+  %arrayidx711 = getelementptr inbounds [4 x float], ptr addrspace(3) @LDS, i32 0, i32 %sub
+  %i6 = load float, ptr addrspace(3) %arrayidx711, align 4
   %idxprom8 = sext i32 %X to i64
-  %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
-  store float %i6, float* %arrayidx9, align 4
+  %arrayidx9 = getelementptr inbounds float, ptr %i1, i64 %idxprom8
+  store float %i6, ptr %arrayidx9, align 4
   ret void
 }
 
 ; GCN-LABEL: ptr_nest_3_barrier:
 ; GCN-COUNT-2: global_load_dwordx2
 ; GCN:         global_store_dword
-define amdgpu_kernel void @ptr_nest_3_barrier(float** addrspace(1)* nocapture readonly %Arg) {
+define amdgpu_kernel void @ptr_nest_3_barrier(ptr addrspace(1) nocapture readonly %Arg) {
 ; CHECK-LABEL: @ptr_nest_3_barrier(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG:%.*]], i32 [[I]]
 ; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT:    [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2_GLOBAL]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
+  %p1 = getelementptr inbounds ptr, ptr addrspace(1) %Arg, i32 %i
   tail call void @llvm.amdgcn.s.barrier()
-  %p2 = load float**, float** addrspace(1)* %p1, align 8
-  %p3 = load float*, float** %p2, align 8
-  store float 0.000000e+00, float* %p3, align 4
+  %p2 = load ptr, ptr addrspace(1) %p1, align 8
+  %p3 = load ptr, ptr %p2, align 8
+  store float 0.000000e+00, ptr %p3, align 4
   ret void
 }
 
@@ -344,20 +340,20 @@ entry:
 ; GCN: s_lshl_b64
 ; GCN: s_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @flat_ptr_nest_2(float** nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @flat_ptr_nest_2(ptr nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @flat_ptr_nest_2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load float*, float* addrspace(1)* [[P1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr, ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
-  %p2 = load float*, float** %p1, align 8
-  store float 0.000000e+00, float* %p2, align 4
+  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
+  %p2 = load ptr, ptr %p1, align 8
+  store float 0.000000e+00, ptr %p2, align 4
   ret void
 }
 
@@ -366,21 +362,21 @@ entry:
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @const_ptr_nest_3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[TMP0]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
-  %p2 = load float* addrspace(4)*, float * addrspace(4)* addrspace(4)* %p1, align 8
-  %p3 = load float*, float* addrspace(4)* %p2, align 8
-  store float 0.000000e+00, float* %p3, align 4
+  %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
+  %p2 = load ptr addrspace(4), ptr addrspace(4) %p1, align 8
+  %p3 = load ptr, ptr addrspace(4) %p2, align 8
+  store float 0.000000e+00, ptr %p3, align 4
   ret void
 }
 
@@ -389,23 +385,23 @@ entry:
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(float* addrspace(4)* addrspace(4)* nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @cast_from_const_const_ptr_nest_3(ptr addrspace(4) nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @cast_from_const_const_ptr_nest_3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load float* addrspace(4)*, float* addrspace(4)* addrspace(4)* [[P1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(4)* [[P2]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(4) [[P2]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float* addrspace(4)*, float* addrspace(4)* addrspace(4)* %Arg, i32 %i
-  %a1 = addrspacecast float* addrspace(4)* addrspace(4)* %p1 to float* addrspace(4)**
-  %p2 = load float* addrspace(4)*, float* addrspace(4)** %a1, align 8
-  %a2 = addrspacecast float* addrspace(4)* %p2 to float**
-  %p3 = load float*, float** %a2, align 8
-  store float 0.000000e+00, float* %p3, align 4
+  %p1 = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) %Arg, i32 %i
+  %a1 = addrspacecast ptr addrspace(4) %p1 to ptr
+  %p2 = load ptr addrspace(4), ptr %a1, align 8
+  %a2 = addrspacecast ptr addrspace(4) %p2 to ptr
+  %p3 = load ptr, ptr %a2, align 8
+  store float 0.000000e+00, ptr %p3, align 4
   ret void
 }
 
@@ -413,21 +409,21 @@ entry:
 ; GCN: s_lshl_b64
 ; GCN: flat_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @flat_ptr_volatile_load(float** nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @flat_ptr_volatile_load(ptr nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @flat_ptr_volatile_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast float* addrspace(1)* [[P1]] to float**
-; CHECK-NEXT:    [[P2:%.*]] = load volatile float*, float** [[TMP0]], align 8
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
+; CHECK-NEXT:    [[P2:%.*]] = load volatile ptr, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
-  %p2 = load volatile float*, float** %p1, align 8
-  store float 0.000000e+00, float* %p2, align 4
+  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
+  %p2 = load volatile ptr, ptr %p1, align 8
+  store float 0.000000e+00, ptr %p2, align 4
   ret void
 }
 
@@ -435,20 +431,20 @@ entry:
 ; GCN: s_lshl_b64
 ; GCN: global_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @flat_ptr_atomic_load(float** nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @flat_ptr_atomic_load(ptr nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @flat_ptr_atomic_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = load atomic float*, float* addrspace(1)* [[P1]] monotonic, align 8
-; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast float* [[P2]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P2_GLOBAL]], align 4
+; CHECK-NEXT:    [[ARG_GLOBAL:%.*]] = addrspacecast ptr [[ARG:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr, ptr addrspace(1) [[ARG_GLOBAL]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = load atomic ptr, ptr addrspace(1) [[P1]] monotonic, align 8
+; CHECK-NEXT:    [[P2_GLOBAL:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P2_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float*, float** %Arg, i32 %i
-  %p2 = load atomic float*, float** %p1 monotonic, align 8
-  store float 0.000000e+00, float* %p2, align 4
+  %p1 = getelementptr inbounds ptr, ptr %Arg, i32 %i
+  %p2 = load atomic ptr, ptr %p1 monotonic, align 8
+  store float 0.000000e+00, ptr %p2, align 4
   ret void
 }
 
@@ -457,25 +453,23 @@ entry:
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: global_store_dword
-define amdgpu_kernel void @cast_changing_pointee_type(float* addrspace(1)* addrspace(1)* nocapture readonly %Arg, i32 %i) {
+define amdgpu_kernel void @cast_changing_pointee_type(ptr addrspace(1) nocapture readonly %Arg, i32 %i) {
 ; CHECK-LABEL: @cast_changing_pointee_type(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* [[ARG:%.*]], i32 [[I:%.*]]
-; CHECK-NEXT:    [[A1:%.*]] = bitcast float* addrspace(1)* addrspace(1)* [[P1]] to i32* addrspace(1)* addrspace(1)*
-; CHECK-NEXT:    [[P2:%.*]] = load i32* addrspace(1)*, i32* addrspace(1)* addrspace(1)* [[A1]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[A2:%.*]] = bitcast i32* addrspace(1)* [[P2]] to float* addrspace(1)*
-; CHECK-NEXT:    [[P3:%.*]] = load float*, float* addrspace(1)* [[A2]], align 8, !amdgpu.noclobber !0
-; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
-; CHECK-NEXT:    store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]
+; CHECK-NEXT:    [[P2:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[P1]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3:%.*]] = load ptr, ptr addrspace(1) [[P2]], align 8, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[P3_GLOBAL:%.*]] = addrspacecast ptr [[P3]] to ptr addrspace(1)
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[P3_GLOBAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %p1 = getelementptr inbounds float* addrspace(1)*, float* addrspace(1)* addrspace(1)* %Arg, i32 %i
-  %a1 = addrspacecast float* addrspace(1)* addrspace(1)* %p1 to i32* addrspace(1)**
-  %p2 = load i32* addrspace(1)*, i32* addrspace(1)** %a1, align 8
-  %a2 = addrspacecast i32* addrspace(1)* %p2 to float**
-  %p3 = load float*, float** %a2, align 8
-  store float 0.000000e+00, float* %p3, align 4
+  %p1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %Arg, i32 %i
+  %a1 = addrspacecast ptr addrspace(1) %p1 to ptr
+  %p2 = load ptr addrspace(1), ptr %a1, align 8
+  %a2 = addrspacecast ptr addrspace(1) %p2 to ptr
+  %p3 = load ptr, ptr %a2, align 8
+  store float 0.000000e+00, ptr %p3, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index 96ada95a0821..dabb9d43bf3d 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -6,65 +6,65 @@
 ; This type promotion on smaller aligned loads can cause a page fault error
 ; while accessing one extra dword beyond the buffer.
 
-define protected amdgpu_kernel void @load_v3i32_align4(<3 x i32> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3i32_align4(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3i32_align4:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
 ; GCN-NEXT:    s_load_dword s{{[0-9]+}}, s[0:1], 0x8
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 4
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 4
+  %vec = load <3 x i32>, ptr addrspace(1) %arg, align 4
+  store <3 x i32> %vec, ptr addrspace(1) undef, align 4
   ret void
 }
 
-define protected amdgpu_kernel void @load_v3i32_align8(<3 x i32> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3i32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 8
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 8
+  %vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
+  store <3 x i32> %vec, ptr addrspace(1) undef, align 8
   ret void
 }
 
-define protected amdgpu_kernel void @load_v3i32_align16(<3 x i32> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3i32_align16(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3i32_align16:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
-  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
-  store <3 x i32> %vec, <3 x i32> addrspace(1)* undef, align 16
+  %vec = load <3 x i32>, ptr addrspace(1) %arg, align 16
+  store <3 x i32> %vec, ptr addrspace(1) undef, align 16
   ret void
 }
 
-define protected amdgpu_kernel void @load_v3f32_align4(<3 x float> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3f32_align4(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3f32_align4:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
 ; GCN-NEXT:    s_load_dword s{{[0-9]+}}, s[0:1], 0x8
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 4
-  store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 4
+  %vec = load <3 x float>, ptr addrspace(1) %arg, align 4
+  store <3 x float> %vec, ptr addrspace(1) undef, align 4
   ret void
 }
 
-define protected amdgpu_kernel void @load_v3f32_align8(<3 x float> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3f32_align8:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 8
-  store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 8
+  %vec = load <3 x float>, ptr addrspace(1) %arg, align 8
+  store <3 x float> %vec, ptr addrspace(1) undef, align 8
   ret void
 }
 
-define protected amdgpu_kernel void @load_v3f32_align16(<3 x float> addrspace(1)* %arg) #0 {
+define protected amdgpu_kernel void @load_v3f32_align16(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: load_v3f32_align16:
 ; GCN:       ; %bb.0:
 ; GCN:         s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
-  %vec = load <3 x float>, <3 x float> addrspace(1)* %arg, align 16
-  store <3 x float> %vec, <3 x float> addrspace(1)* undef, align 16
+  %vec = load <3 x float>, ptr addrspace(1) %arg, align 16
+  store <3 x float> %vec, ptr addrspace(1) undef, align 16
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
index 370c5cc17e91..7b05864f7a4a 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
@@ -3,27 +3,27 @@
 ; passed to the original call instruction as an argument.
 ;
 ; Example:
-; `call void @f(void ()* @g)`
+; `call void @f(ptr @g)`
 ; could become
-; `call void @g(void ()* @g.1)`
+; `call void @g(ptr @g.1)`
 ; which is invalid IR.
 
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
 
 ; CHECK-LABEL: define amdgpu_kernel void @thiswasabug() #0
-; CHECK-NOT: call void @g(void ()* @g.1)
-; CHECK-DAG: call void @f(void ()* @g.1)
+; CHECK-NOT: call void @g(ptr @g.1)
+; CHECK-DAG: call void @f(ptr @g.1)
 ; CHECK-DAG: call void @g()
 define amdgpu_kernel void @thiswasabug() #0 {
     ; no replacement, but @g should be renamed to @g.1
-    call void @f(void ()* @g)
+    call void @f(ptr @g)
 
     ; this should call the clone, which takes the name @g
     call void @g()
     ret void
 }
 
-define private void @f(void ()* nocapture %0) #0 {
+define private void @f(ptr nocapture %0) #0 {
     ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
index 79ba68b33909..7e0a486c8191 100644
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -30,12 +30,12 @@ declare i64 @llvm.readcyclecounter() #0
 ; GETREG:      v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]]
 ; GETREG:      global_store_{{dwordx2|b64}} v{{.+}}, v[[[VCNT2]]:[[ZERO]]]
 
-define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_readcyclecounter(ptr addrspace(1) %out) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()
-  store volatile i64 %cycle0, i64 addrspace(1)* %out
+  store volatile i64 %cycle0, ptr addrspace(1) %out
 
   %cycle1 = call i64 @llvm.readcyclecounter()
-  store volatile i64 %cycle1, i64 addrspace(1)* %out
+  store volatile i64 %cycle1, ptr addrspace(1) %out
   ret void
 }
 
@@ -45,9 +45,9 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
 ; MEMTIME-DAG: s_memtime
 ; GCN-DAG:     s_load_{{dword|b32|b64}}
 ; GETREG-DAG:  s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
-define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(4)* inreg %in) #0 {
+define amdgpu_cs i32 @test_readcyclecounter_smem(ptr addrspace(4) inreg %in) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()
-  %in.v = load i64, i64 addrspace(4)* %in
+  %in.v = load i64, ptr addrspace(4) %in
   %r.64 = add i64 %cycle0, %in.v
   %r.32 = trunc i64 %r.64 to i32
   ret i32 %r.32

diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index 9aedfad6fe32..076d3cea2804 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -5,7 +5,7 @@
 ; CHECK: ScratchSize: 16
 define void @recursive() {
   call void @recursive()
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   ret void
 }
 
@@ -24,7 +24,7 @@ define void @calls_tail_recursive() norecurse {
 ; CHECK-LABEL: {{^}}tail_recursive_with_stack:
 define void @tail_recursive_with_stack() {
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   tail call void @tail_recursive_with_stack()
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index 5a0c5cbc31c3..b83e49d07e49 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -6,32 +6,32 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
 ; GCN: buffer_store_dwordx2
-define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
-  %a = load i64, i64 addrspace(1)* %in, align 4
+define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i64, ptr addrspace(1) %in, align 4
   %and = and i64 %a, 1234567
-  store i64 %and, i64 addrspace(1)* %out, align 8
+  store i64 %and, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_dword [[VAL]]
-define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
-  %a = load i64, i64 addrspace(1)* %in, align 4
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i64, ptr addrspace(1) %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
-  store i32 %elt0, i32 addrspace(1)* %out
+  store i32 %elt0, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
 ; GCN: buffer_store_dword [[VAL]]
-define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
-  %a = load i64, i64 addrspace(1)* %in, align 4
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+  %a = load i64, ptr addrspace(1) %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 1
-  store i32 %elt0, i32 addrspace(1)* %out
+  store i32 %elt0, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index aae8e649defa..9f6d5697c3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -3,9 +3,9 @@
 ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(ptr addrspace(3) align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
-  store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+  store <4 x i16> %x.bc, ptr addrspace(3) %out, align 4
   ret void
 }
 
@@ -13,18 +13,18 @@ define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)*
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(ptr addrspace(3) align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <8 x i16>
-  store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
+  store <8 x i16> %x.bc, ptr addrspace(3) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_i64_align_4(ptr addrspace(3) align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
-  store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
+  store <4 x i16> %x.bc, ptr addrspace(3) %out, align 4
   ret void
 }
 
@@ -32,9 +32,9 @@ define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* al
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(ptr addrspace(3) align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <2 x i64>
-  store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
+  store <2 x i64> %x.bc, ptr addrspace(3) %out, align 4
   ret void
 }
 
@@ -44,9 +44,9 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
 ; GCN-NOT: {{buffer|flat|global}}
 
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(ptr addrspace(3) align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>
-  store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
+  store <2 x i32> %x.bc, ptr addrspace(3) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
index a379a646bee9..2cb5cd1ec9c5 100644
--- a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -7,11 +7,11 @@
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
-  %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
-  %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
-  store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
-  store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
+define amdgpu_kernel void @no_reorder_v2f64_global_load_store(ptr addrspace(1) nocapture %x, ptr addrspace(1) nocapture %y) nounwind {
+  %tmp1 = load <2 x double>, ptr addrspace(1) %x, align 16
+  %tmp4 = load <2 x double>, ptr addrspace(1) %y, align 16
+  store <2 x double> %tmp4, ptr addrspace(1) %x, align 16
+  store <2 x double> %tmp1, ptr addrspace(1) %y, align 16
   ret void
 }
 
@@ -23,11 +23,11 @@ define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrs
 ; VI: ds_write_b128
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
-  %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
-  %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
-  store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
-  store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
+define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(ptr addrspace(3) nocapture %x, ptr addrspace(3) nocapture %y) nounwind {
+  %tmp1 = load <2 x double>, ptr addrspace(3) %x, align 16
+  %tmp4 = load <2 x double>, ptr addrspace(3) %y, align 16
+  store <2 x double> %tmp4, ptr addrspace(3) %x, align 16
+  store <2 x double> %tmp1, ptr addrspace(3) %y, align 16
   ret void
 }
 
@@ -43,11 +43,11 @@ define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x dou
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
-  %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
-  %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
-  store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
-  store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
+define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(ptr addrspace(1) nocapture %x, ptr addrspace(1) nocapture %y) nounwind {
+  %tmp1 = load <8 x i32>, ptr addrspace(1) %x, align 32
+  %tmp4 = load <8 x i32>, ptr addrspace(1) %y, align 32
+  store <8 x i32> %tmp4, ptr addrspace(1) %x, align 32
+  store <8 x i32> %tmp1, ptr addrspace(1) %y, align 32
   ret void
 }
 
@@ -58,16 +58,16 @@ define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> ad
 ; GCN-NOT: ds_read
 ; GCN: ds_write_b64
 ; GCN: s_endpgm
-define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
-  %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
-  %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
+define amdgpu_kernel void @no_reorder_extload_64(ptr addrspace(3) nocapture %x, ptr addrspace(3) nocapture %y) nounwind {
+  %tmp1 = load <2 x i32>, ptr addrspace(3) %x, align 8
+  %tmp4 = load <2 x i32>, ptr addrspace(3) %y, align 8
   %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
   %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
   %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
   %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
   %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
   %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
-  store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
-  store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
+  store <2 x i32> %trunctmp9, ptr addrspace(3) %x, align 8
+  store <2 x i32> %trunctmp7, ptr addrspace(3) %y, align 8
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index d1ec9d8afaaf..0211bdac2fe6 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -5,140 +5,127 @@ target datalayout = "n32"
 
 ; CHECK-LABEL: @invalid_reqd_work_group_size(
 ; CHECK: load i16,
-define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i16 %group.size.x, i16 addrspace(1)* %out
+define amdgpu_kernel void @invalid_reqd_work_group_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !1 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @volatile_load_group_size_x(
 ; CHECK: load volatile i16,
-define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i16 %group.size.x, i16 addrspace(1)* %out
+define amdgpu_kernel void @volatile_load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load volatile i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @load_group_size_x(
 ; CHECK-NEXT: store i16 8,
-define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i16 %group.size.x, i16 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @load_group_size_y(
 ; CHECK-NEXT: store i16 16,
-define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
-  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
-  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
-  store i16 %group.size.y, i16 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_y(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6
+  %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
+  store i16 %group.size.y, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @load_group_size_z(
 ; CHECK-NEXT: store i16 2,
-define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
-  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
-  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
-  store i16 %group.size.z, i16 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_z(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8
+  %group.size.z = load i16, ptr addrspace(4) %gep.group.size.z, align 4
+  store i16 %group.size.z, ptr addrspace(1) %out
   ret void
 }
 
 ; Metadata uses i64 instead of i32
 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
 ; CHECK-NEXT: store i16 8,
-define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i16 %group.size.x, i16 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(ptr addrspace(1) %out) #0 !reqd_work_group_size !2 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; Metadata uses i16 instead of i32
 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
 ; CHECK-NEXT: store i16 8,
-define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i16 %group.size.x, i16 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(ptr addrspace(1) %out) #0 !reqd_work_group_size !3 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store i16 %group.size.x, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_x_8_16_2(
 ; CHECK-NEXT: store i64 8,
-define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @use_local_size_x_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_y_8_16_2(
 ; CHECK-NEXT: store i64 16,
-define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
-  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
-  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
-  %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
-  %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
-  %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
+define amdgpu_kernel void @use_local_size_y_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6
+  %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
+  %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 16
+  %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
   %group.size.y.zext = zext i16 %group.size.y to i32
   %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
   %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.y.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_z_8_16_2(
 ; CHECK-NEXT: store i64 2,
-define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
-  %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
-  %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
-  %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
-  %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
-  %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
+define amdgpu_kernel void @use_local_size_z_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8
+  %group.size.z = load i16, ptr addrspace(4) %gep.group.size.z, align 4
+  %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 20
+  %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
   %group.size.z.zext = zext i16 %group.size.z to i32
   %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
   %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.z.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
@@ -148,67 +135,61 @@ define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !r
 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
-define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
-; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
-  define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+  define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 16
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
-; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 ; CHECK: %smin = call i32 @llvm.smin.i32(i32 %sub, i32 8)
-define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %smin = call i32 @llvm.smin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %smin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
@@ -217,38 +198,34 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)*
 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
 ; CHECK: %umax = call i32 @llvm.umax.i32(i32 %sub, i32 8)
 ; CHECK: %zext = zext i32 %umax to i64
-define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umax = call i32 @llvm.umax.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umax to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
-; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
+; CHECK: %grid.size.x = load i16, ptr addrspace(4) %gep.grid.size.x, align 4
 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
 ; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
-define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
-  %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i16, ptr addrspace(4) %gep.grid.size.x, align 4
   %grid.size.x.zext = zext i16 %grid.size.x to i32
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
@@ -256,17 +233,16 @@ define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addr
   %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @func_group_size_x(
 ; CHECK-NEXT: ret i32 8
-define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
+define i32 @func_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
   %zext = zext i16 %group.size.x to i32
   ret i32 %zext
 }
@@ -275,7 +251,7 @@ define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !
 ; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
 define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
 bb:
-  %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
+  %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2
   switch i32 %arg, label %bb25 [
     i32 0, label %bb1
     i32 1, label %bb9
@@ -284,32 +260,26 @@ bb:
 
 bb1:                                              ; preds = %bb
   %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
-  %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
-  %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
-  %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
-  %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
-  %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
-  %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
+  %tmp3 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 12
+  %tmp5 = load i32, ptr addrspace(4) %tmp3, align 4
+  %tmp6 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 4
+  %tmp8 = load i16, ptr addrspace(4) %tmp6, align 4
   br label %bb25
 
 bb9:                                              ; preds = %bb
   %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
-  %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
-  %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
-  %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
-  %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
-  %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
-  %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
+  %tmp11 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 16
+  %tmp13 = load i32, ptr addrspace(4) %tmp11, align 8
+  %tmp14 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 6
+  %tmp16 = load i16, ptr addrspace(4) %tmp14, align 2
   br label %bb25
 
 bb17:                                             ; preds = %bb
   %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
-  %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
-  %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
-  %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
-  %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
-  %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
-  %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
+  %tmp19 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 20
+  %tmp21 = load i32, ptr addrspace(4) %tmp19, align 4
+  %tmp22 = getelementptr inbounds i8, ptr addrspace(4) %tmp, i64 8
+  %tmp24 = load i16, ptr addrspace(4) %tmp22, align 8
   br label %bb25
 
 bb25:                                             ; preds = %bb17, %bb9, %bb1, %bb
@@ -325,77 +295,71 @@ bb25:                                             ; preds = %bb17, %bb9, %bb1, %
 }
 
 ; CHECK-LABEL: @all_local_size(
-; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
-; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
-; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
-define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
-  %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+; CHECK-NEXT: store volatile i64 8, ptr addrspace(1) %out, align 4
+; CHECK-NEXT: store volatile i64 16, ptr addrspace(1) %out, align 4
+; CHECK-NEXT: store volatile i64 2, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @all_local_size(ptr addrspace(1) nocapture readnone %out) #0 !reqd_work_group_size !0 {
+  %tmp.i = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
   %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
-  %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
-  %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
-  %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
-  %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
-  %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
-  %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
+  %tmp3.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 12
+  %tmp5.i = load i32, ptr addrspace(4) %tmp3.i, align 4
+  %tmp6.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 4
+  %tmp8.i = load i16, ptr addrspace(4) %tmp6.i, align 4
   %tmp29.i = zext i16 %tmp8.i to i32
   %tmp30.i = mul i32 %tmp2.i, %tmp29.i
   %tmp31.i = sub i32 %tmp5.i, %tmp30.i
   %umin0 = call i32 @llvm.umin.i32(i32 %tmp31.i, i32 %tmp29.i)
   %tmp34.i = zext i32 %umin0 to i64
   %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
-  %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
-  %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
-  %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
-  %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
-  %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
-  %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
+  %tmp11.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 16
+  %tmp13.i = load i32, ptr addrspace(4) %tmp11.i, align 8
+  %tmp14.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 6
+  %tmp16.i = load i16, ptr addrspace(4) %tmp14.i, align 2
   %tmp29.i9 = zext i16 %tmp16.i to i32
   %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
   %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
   %umin1 = call i32 @llvm.umin.i32(i32 %tmp31.i11, i32 %tmp29.i9)
   %tmp34.i14 = zext i32 %umin1 to i64
   %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
-  %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
-  %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
-  %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
-  %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
-  %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
-  %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
+  %tmp19.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 20
+  %tmp21.i = load i32, ptr addrspace(4) %tmp19.i, align 4
+  %tmp22.i = getelementptr inbounds i8, ptr addrspace(4) %tmp.i, i64 8
+  %tmp24.i = load i16, ptr addrspace(4) %tmp22.i, align 8
   %tmp29.i2 = zext i16 %tmp24.i to i32
   %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
   %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
   %umin2 = call i32 @llvm.umin.i32(i32 %tmp31.i4, i32 %tmp29.i2)
   %tmp34.i7 = zext i32 %umin2 to i64
-  store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
-  store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
-  store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
+  store volatile i64 %tmp34.i, ptr addrspace(1) %out, align 4
+  store volatile i64 %tmp34.i14, ptr addrspace(1) %out, align 4
+  store volatile i64 %tmp34.i7, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; TODO: Should be able to handle this, but not much reason to.
 ; CHECK-LABEL: @partial_load_group_size_x(
-; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 4
-; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
-define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
-  store i8 %group.size.x.lo, i8 addrspace(1)* %out
+; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 4
+; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1
+define amdgpu_kernel void @partial_load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 1
+  store i8 %group.size.x.lo, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @partial_load_group_size_x_explicit_callsite_align(
-; CHECK-NEXT: %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 2
-; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
-define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call align 2 i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
-  store i8 %group.size.x.lo, i8 addrspace(1)* %out
+; CHECK-NEXT: %dispatch.ptr = tail call align 2 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+; CHECK-NEXT: %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 2
+; CHECK-NEXT: store i8 %group.size.x.lo, ptr addrspace(1) %out, align 1
+define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call align 2 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x.lo = load i8, ptr addrspace(4) %gep.group.size.x, align 1
+  store i8 %group.size.x.lo, ptr addrspace(1) %out
   ret void
 }
 
@@ -403,87 +367,79 @@ define amdgpu_kernel void @partial_load_group_size_x_explicit_callsite_align(i8
 ; CHECK-LABEL: @load_group_size_xy_i32(
 ; CHECK: %group.size.xy = load i32,
 ; CHECK: store i32 %group.size.xy
-define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
-  %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
-  store i32 %group.size.xy, i32 addrspace(1)* %out
+define amdgpu_kernel void @load_group_size_xy_i32(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.xy = load i32, ptr addrspace(4) %gep.group.size.x, align 4
+  store i32 %group.size.xy, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
-; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
-; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
-define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
-  %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  store volatile i16 %group.size.x, i16 addrspace(1)* %out
-
-  %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
-  %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
-  %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
-  store volatile i16 %group.size.y, i16 addrspace(1)* %out
+; CHECK-NEXT: store volatile i16 8, ptr addrspace(1) %out, align 2
+; CHECK-NEXT: store volatile i16 16, ptr addrspace(1) %out, align 2
+define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 {
+  %dispatch.ptr0 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr0, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  store volatile i16 %group.size.x, ptr addrspace(1) %out
+
+  %dispatch.ptr1 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr1, i64 6
+  %group.size.y = load i16, ptr addrspace(4) %gep.group.size.y, align 4
+  store volatile i16 %group.size.y, ptr addrspace(1) %out
 
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
-; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
+; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+; CHECK-NEXT: %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
 ; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
-; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
-define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+; CHECK-NEXT: store i64 %zext, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(ptr addrspace(1) %out) #2 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
 ; CHECK: call i32 @llvm.umin
-define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-  %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
-  %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
-  %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
-  %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
-  %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
-  %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
+define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(ptr addrspace(1) %out) #3 {
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4
+  %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4
+  %gep.grid.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 12
+  %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4
   %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %group.size.x.zext = zext i16 %group.size.x to i32
   %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
   %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %group.size.x.zext)
   %zext = zext i32 %umin to i64
-  store i64 %zext, i64 addrspace(1)* %out
+  store i64 %zext, ptr addrspace(1) %out
   ret void
 }
 
 ; CHECK-LABEL: @no_use_dispatch_ptr(
 ; CHECK-NEXT: ret void
 define amdgpu_kernel void @no_use_dispatch_ptr() {
-  %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+  %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
   ret void
 }
 
-declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
 declare i32 @llvm.amdgcn.workgroup.id.x() #1
 declare i32 @llvm.amdgcn.workgroup.id.y() #1
 declare i32 @llvm.amdgcn.workgroup.id.z() #1

diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index 1c8b8be33b0a..664d08652e0a 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -4,13 +4,13 @@
 ; Make sure there's no assertion when trying to report the resource
 ; usage for a function which becomes dead during codegen.
 
-@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant void()*, align 4
+@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
 ; GCN-LABEL: unreachable:
 ; Function info:
 ; codeLenInByte = 4
 define internal fastcc void @unreachable() {
-  %fptr = load void()*, void()* addrspace(4)* @gv.fptr0
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   unreachable
 }

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
index f5cb61ab5efa..d8afaf18b518 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
@@ -3,31 +3,29 @@
 ; CHECK: %void_one_out_non_private_arg_i32_1_use = type { i32 }
 ; CHECK: %bitcast_pointer_as1 = type { <4 x i32> }
 
-; CHECK-LABEL: define private %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(i32 addrspace(1)* %val) #0 {
+; CHECK-LABEL: define private %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(ptr addrspace(1) %val) #0 {
 ; CHECK-NEXT: ret %void_one_out_non_private_arg_i32_1_use zeroinitializer
 
-; CHECK-LABEL: define void @void_one_out_non_private_arg_i32_1_use(i32 addrspace(1)* %0) #1 {
-; CHECK-NEXT: %2 = call %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(i32 addrspace(1)* poison)
+; CHECK-LABEL: define void @void_one_out_non_private_arg_i32_1_use(ptr addrspace(1) %0) #1 {
+; CHECK-NEXT: %2 = call %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(ptr addrspace(1) poison)
 ; CHECK-NEXT: %3 = extractvalue %void_one_out_non_private_arg_i32_1_use %2, 0
-; CHECK-NEXT: store i32 %3, i32 addrspace(1)* %0, align 4
+; CHECK-NEXT: store i32 %3, ptr addrspace(1) %0, align 4
 ; CHECK-NEXT: ret void
-define void @void_one_out_non_private_arg_i32_1_use(i32 addrspace(1)* %val) #0 {
-  store i32 0, i32 addrspace(1)* %val
+define void @void_one_out_non_private_arg_i32_1_use(ptr addrspace(1) %val) #0 {
+  store i32 0, ptr addrspace(1) %val
   ret void
 }
 
-; CHECK-LABEL: define private %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* %out) #0 {
-; CHECK-NEXT: %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison
-; CHECK-NEXT: %bitcast = bitcast <3 x i32> addrspace(1)* %out to <4 x i32> addrspace(1)*
+; CHECK-LABEL: define private %bitcast_pointer_as1 @bitcast_pointer_as1.body(ptr addrspace(1) %out) #0 {
+; CHECK-NEXT: %load = load volatile <4 x i32>, ptr addrspace(1) poison
 ; CHECK-NEXT: %1 = insertvalue %bitcast_pointer_as1 poison, <4 x i32> %load, 0
 ; CHECK-NEXT: ret %bitcast_pointer_as1 %1
 
-; CHECK-LABEL: define void @bitcast_pointer_as1(<3 x i32> addrspace(1)* %0) #1 {
-; CHECK-NEXT: %2 = call %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* poison)
-define void @bitcast_pointer_as1(<3 x i32> addrspace(1)* %out) #0 {
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison
-  %bitcast = bitcast <3 x i32> addrspace(1)* %out to <4 x i32> addrspace(1)*
-  store <4 x i32> %load, <4 x i32> addrspace(1)* %bitcast
+; CHECK-LABEL: define void @bitcast_pointer_as1(ptr addrspace(1) %0) #1 {
+; CHECK-NEXT: %2 = call %bitcast_pointer_as1 @bitcast_pointer_as1.body(ptr addrspace(1) poison)
+define void @bitcast_pointer_as1(ptr addrspace(1) %out) #0 {
+  %load = load volatile <4 x i32>, ptr addrspace(1) poison
+  store <4 x i32> %load, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
index 44a617fa02cb..9d028602ad6f 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
@@ -6,114 +6,114 @@ define void @no_ret_blocks() #0 {
   unreachable
 }
 
-define void @void_one_out_arg_i32_no_use(i32* %val) #0 {
+define void @void_one_out_arg_i32_no_use(ptr %val) #0 {
   ret void
 }
 
-define void @skip_byval_arg(i32* byval(i32) %val) #0 {
-  store i32 0, i32* %val
+define void @skip_byval_arg(ptr byval(i32) %val) #0 {
+  store i32 0, ptr %val
   ret void
 }
 
-define void @skip_optnone(i32* byval(i32) %val) #1 {
-  store i32 0, i32* %val
+define void @skip_optnone(ptr byval(i32) %val) #1 {
+  store i32 0, ptr %val
   ret void
 }
 
-define void @skip_volatile(i32* byval(i32) %val) #0 {
-  store volatile i32 0, i32* %val
+define void @skip_volatile(ptr byval(i32) %val) #0 {
+  store volatile i32 0, ptr %val
   ret void
 }
 
-define void @skip_atomic(i32* byval(i32) %val) #0 {
-  store atomic i32 0, i32* %val seq_cst, align 4
+define void @skip_atomic(ptr byval(i32) %val) #0 {
+  store atomic i32 0, ptr %val seq_cst, align 4
   ret void
 }
 
-define void @skip_store_pointer_val(i32* %val) #0 {
-  store i32* %val, i32** poison
+define void @skip_store_pointer_val(ptr %val) #0 {
+  store ptr %val, ptr poison
   ret void
 }
 
-define void @skip_store_gep(i32* %val) #0 {
-  %gep = getelementptr inbounds i32, i32* %val, i32 1
-  store i32 0, i32* %gep
+define void @skip_store_gep(ptr %val) #0 {
+  %gep = getelementptr inbounds i32, ptr %val, i32 1
+  store i32 0, ptr %gep
   ret void
 }
 
-define void @skip_sret(i32* sret(i32) %sret, i32* %out) #0 {
-  store i32 1, i32* %sret
-  store i32 0, i32* %out
+define void @skip_sret(ptr sret(i32) %sret, ptr %out) #0 {
+  store i32 1, ptr %sret
+  store i32 0, ptr %out
   ret void
 }
 
 
-define void @void_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 0, i32* %val
+define void @void_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 0, ptr %val
   ret void
 }
 
 
-define void @void_one_out_arg_i32_1_use_align(i32* align 8 %val) #0 {
-  store i32 0, i32* %val, align 8
+define void @void_one_out_arg_i32_1_use_align(ptr align 8 %val) #0 {
+  store i32 0, ptr %val, align 8
   ret void
 }
 
 
 
 
-define void @void_one_out_arg_i32_2_use(i1 %arg0, i32* %val) #0 {
+define void @void_one_out_arg_i32_2_use(i1 %arg0, ptr %val) #0 {
   br i1 %arg0, label %ret0, label %ret1
 
 ret0:
-  store i32 0, i32* %val
+  store i32 0, ptr %val
   ret void
 
 ret1:
-  store i32 9, i32* %val
+  store i32 9, ptr %val
   ret void
 }
 
 declare void @may.clobber()
 
 
-define void @void_one_out_arg_i32_2_stores(i32* %val) #0 {
-  store i32 0, i32* %val
-  store i32 1, i32* %val
+define void @void_one_out_arg_i32_2_stores(ptr %val) #0 {
+  store i32 0, ptr %val
+  store i32 1, ptr %val
   ret void
 }
 
 
-define void @void_one_out_arg_i32_2_stores_clobber(i32* %val) #0 {
-  store i32 0, i32* %val
+define void @void_one_out_arg_i32_2_stores_clobber(ptr %val) #0 {
+  store i32 0, ptr %val
   call void @may.clobber()
-  store i32 1, i32* %val
+  store i32 1, ptr %val
   ret void
 }
 
 
-define void @void_one_out_arg_i32_call_may_clobber(i32* %val) #0 {
-  store i32 0, i32* %val
+define void @void_one_out_arg_i32_call_may_clobber(ptr %val) #0 {
+  store i32 0, ptr %val
   call void @may.clobber()
   ret void
 }
 
 
-define void @void_one_out_arg_i32_pre_call_may_clobber(i32* %val) #0 {
+define void @void_one_out_arg_i32_pre_call_may_clobber(ptr %val) #0 {
   call void @may.clobber()
-  store i32 0, i32* %val
+  store i32 0, ptr %val
   ret void
 }
 
-define void @void_one_out_arg_i32_reload(i32* %val) #0 {
-  store i32 0, i32* %val
-  %load = load i32, i32* %val, align 4
+define void @void_one_out_arg_i32_reload(ptr %val) #0 {
+  store i32 0, ptr %val
+  %load = load i32, ptr %val, align 4
   ret void
 }
 
-define void @void_one_out_arg_i32_store_in_different_block(i32* %out) #0 {
-  %load = load i32, i32 addrspace(1)* poison
-  store i32 0, i32* %out
+define void @void_one_out_arg_i32_store_in_different_block(ptr %out) #0 {
+  %load = load i32, ptr addrspace(1) poison
+  store i32 0, ptr %out
   br label %ret
 
 ret:
@@ -121,20 +121,20 @@ ret:
 }
 
 
-define void @unused_out_arg_one_branch(i1 %arg0, i32* %val) #0 {
+define void @unused_out_arg_one_branch(i1 %arg0, ptr %val) #0 {
   br i1 %arg0, label %ret0, label %ret1
 
 ret0:
   ret void
 
 ret1:
-  store i32 9, i32* %val
+  store i32 9, ptr %val
   ret void
 }
 
 
-define void @void_one_out_arg_v2i32_1_use(<2 x i32>* %val) #0 {
-  store <2 x i32> <i32 17, i32 9>, <2 x i32>* %val
+define void @void_one_out_arg_v2i32_1_use(ptr %val) #0 {
+  store <2 x i32> <i32 17, i32 9>, ptr %val
   ret void
 }
 
@@ -142,54 +142,54 @@ define void @void_one_out_arg_v2i32_1_use(<2 x i32>* %val) #0 {
 
 
 ; Normally this is split into element accesses which we don't handle.
-define void @void_one_out_arg_struct_1_use(%struct* %out) #0 {
-  store %struct { i32 9, i8 99, float 4.0 }, %struct* %out
+define void @void_one_out_arg_struct_1_use(ptr %out) #0 {
+  store %struct { i32 9, i8 99, float 4.0 }, ptr %out
   ret void
 }
 
 
-define i32 @i32_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 24, i32* %val
+define i32 @i32_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 24, ptr %val
   ret i32 9
 }
 
 
-define void @unused_different_type(i32* %arg0, float* nocapture %arg1) #0 {
-  store float 4.0, float* %arg1, align 4
+define void @unused_different_type(ptr %arg0, ptr nocapture %arg1) #0 {
+  store float 4.0, ptr %arg1, align 4
   ret void
 }
 
 
-define void @multiple_same_return_noalias(i32* noalias %out0, i32* noalias %out1) #0 {
-  store i32 1, i32* %out0, align 4
-  store i32 2, i32* %out1, align 4
+define void @multiple_same_return_noalias(ptr noalias %out0, ptr noalias %out1) #0 {
+  store i32 1, ptr %out0, align 4
+  store i32 2, ptr %out1, align 4
   ret void
 }
 
 
-define void @multiple_same_return_mayalias(i32* %out0, i32* %out1) #0 {
-  store i32 1, i32* %out0, align 4
-  store i32 2, i32* %out1, align 4
+define void @multiple_same_return_mayalias(ptr %out0, ptr %out1) #0 {
+  store i32 1, ptr %out0, align 4
+  store i32 2, ptr %out1, align 4
   ret void
 }
 
 
-define void @multiple_same_return_mayalias_order(i32* %out0, i32* %out1) #0 {
-  store i32 2, i32* %out1, align 4
-  store i32 1, i32* %out0, align 4
+define void @multiple_same_return_mayalias_order(ptr %out0, ptr %out1) #0 {
+  store i32 2, ptr %out1, align 4
+  store i32 1, ptr %out0, align 4
   ret void
 }
 
 ; Currently this fails to convert because the store won't be found if
 ; it isn't in the same block as the return.
-define i32 @store_in_entry_block(i1 %arg0, i32* %out) #0 {
+define i32 @store_in_entry_block(i1 %arg0, ptr %out) #0 {
 entry:
-  %val0 = load i32, i32 addrspace(1)* poison
-  store i32 %val0, i32* %out
+  %val0 = load i32, ptr addrspace(1) poison
+  store i32 %val0, ptr %out
   br i1 %arg0, label %if, label %endif
 
 if:
-  %val1 = load i32, i32 addrspace(1)* poison
+  %val1 = load i32, ptr addrspace(1) poison
   br label %endif
 
 endif:
@@ -198,8 +198,8 @@ endif:
 }
 
 
-define i1 @i1_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 24, i32* %val
+define i1 @i1_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 24, ptr %val
   ret i1 true
 }
 
@@ -207,99 +207,96 @@ define i1 @i1_one_out_arg_i32_1_use(i32* %val) #0 {
 ; incompatible with struct return types.
 
 
-define zeroext i1 @i1_zeroext_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 24, i32* %val
+define zeroext i1 @i1_zeroext_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 24, ptr %val
   ret i1 true
 }
 
 
-define signext i1 @i1_signext_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 24, i32* %val
+define signext i1 @i1_signext_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 24, ptr %val
   ret i1 true
 }
 
 
-define noalias i32 addrspace(1)* @p1i32_noalias_one_out_arg_i32_1_use(i32* %val) #0 {
-  store i32 24, i32* %val
-  ret i32 addrspace(1)* null
+define noalias ptr addrspace(1) @p1i32_noalias_one_out_arg_i32_1_use(ptr %val) #0 {
+  store i32 24, ptr %val
+  ret ptr addrspace(1) null
 }
 
-define void @void_one_out_non_private_arg_i32_1_use(i32 addrspace(1)* %val) #0 {
-  store i32 0, i32 addrspace(1)* %val
+define void @void_one_out_non_private_arg_i32_1_use(ptr addrspace(1) %val) #0 {
+  store i32 0, ptr addrspace(1) %val
   ret void
 }
 
-define void @func_ptr_type(void()** %out) #0 {
-  %func = load void()*, void()** poison
-  store void()* %func, void()** %out
+define void @func_ptr_type(ptr %out) #0 {
+  %func = load ptr, ptr poison
+  store ptr %func, ptr %out
   ret void
 }
 
-define void @bitcast_func_ptr_type(void()** %out) #0 {
-  %func = load i32()*, i32()** poison
-  %cast = bitcast void()** %out to i32()**
-  store i32()* %func, i32()** %cast
+define void @bitcast_func_ptr_type(ptr %out) #0 {
+  %func = load ptr, ptr poison
+  store ptr %func, ptr %out
   ret void
 }
 
 
-define void @out_arg_small_array([4 x i32]* %val) #0 {
-  store [4 x i32] [i32 0, i32 1, i32 2, i32 3], [4 x i32]* %val
+define void @out_arg_small_array(ptr %val) #0 {
+  store [4 x i32] [i32 0, i32 1, i32 2, i32 3], ptr %val
   ret void
 }
 
-define void @out_arg_large_array([17 x i32]* %val) #0 {
-  store [17 x i32] zeroinitializer, [17 x i32]* %val
+define void @out_arg_large_array(ptr %val) #0 {
+  store [17 x i32] zeroinitializer, ptr %val
   ret void
 }
 
-define <16 x i32> @num_regs_return_limit(i32* %out, i32 %val) #0 {
-  %load = load volatile <16 x i32>, <16 x i32> addrspace(1)* poison
-  store i32 %val, i32* %out
+define <16 x i32> @num_regs_return_limit(ptr %out, i32 %val) #0 {
+  %load = load volatile <16 x i32>, ptr addrspace(1) poison
+  store i32 %val, ptr %out
   ret <16 x i32> %load
 }
 
-define [15 x i32] @num_regs_reach_limit(i32* %out, i32 %val) #0 {
-  %load = load volatile [15 x i32], [15 x i32] addrspace(1)* poison
-  store i32 %val, i32* %out
+define [15 x i32] @num_regs_reach_limit(ptr %out, i32 %val) #0 {
+  %load = load volatile [15 x i32], ptr addrspace(1) poison
+  store i32 %val, ptr %out
   ret [15 x i32] %load
 }
 
 
-define [15 x i32] @num_regs_reach_limit_leftover(i32* %out0, i32* %out1, i32 %val0) #0 {
-  %load0 = load volatile [15 x i32], [15 x i32] addrspace(1)* poison
-  %load1 = load volatile i32, i32 addrspace(1)* poison
-  store i32 %val0, i32* %out0
-  store i32 %load1, i32* %out1
+define [15 x i32] @num_regs_reach_limit_leftover(ptr %out0, ptr %out1, i32 %val0) #0 {
+  %load0 = load volatile [15 x i32], ptr addrspace(1) poison
+  %load1 = load volatile i32, ptr addrspace(1) poison
+  store i32 %val0, ptr %out0
+  store i32 %load1, ptr %out1
   ret [15 x i32] %load0
 }
 
 
-define void @preserve_debug_info(i32 %arg0, i32* %val) #0 !dbg !5 {
+define void @preserve_debug_info(i32 %arg0, ptr %val) #0 !dbg !5 {
   call void @may.clobber(), !dbg !10
-  store i32 %arg0, i32* %val, !dbg !11
+  store i32 %arg0, ptr %val, !dbg !11
   ret void, !dbg !12
 }
 
-define void @preserve_metadata(i32 %arg0, i32* %val) #0 !kernel_arg_access_qual !13 {
+define void @preserve_metadata(i32 %arg0, ptr %val) #0 !kernel_arg_access_qual !13 {
   call void @may.clobber()
-  store i32 %arg0, i32* %val
+  store i32 %arg0, ptr %val
   ret void
 }
 
 ; Clang emits this pattern for 3-vectors for some reason.
 
-define void @bitcast_pointer_v4i32_v3i32(<3 x i32>* %out) #0 {
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison
-  %bitcast = bitcast <3 x i32>* %out to <4 x i32>*
-  store <4 x i32> %load, <4 x i32>* %bitcast
+define void @bitcast_pointer_v4i32_v3i32(ptr %out) #0 {
+  %load = load volatile <4 x i32>, ptr addrspace(1) poison
+  store <4 x i32> %load, ptr %out
   ret void
 }
 
-define void @bitcast_pointer_v4i32_v3f32(<3 x float>* %out) #0 {
-  %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison
-  %bitcast = bitcast <3 x float>* %out to <4 x i32>*
-  store <4 x i32> %load, <4 x i32>* %bitcast
+define void @bitcast_pointer_v4i32_v3f32(ptr %out) #0 {
+  %load = load volatile <4 x i32>, ptr addrspace(1) poison
+  store <4 x i32> %load, ptr %out
   ret void
 }
 
@@ -308,24 +305,21 @@ define void @bitcast_pointer_v4i32_v3f32(<3 x float>* %out) #0 {
 ; casts.
 
 
-define void @bitcast_pointer_i32_f32(float* %out) #0 {
-  %load = load volatile i32, i32 addrspace(1)* poison
-  %bitcast = bitcast float* %out to i32*
-  store i32 %load, i32* %bitcast
+define void @bitcast_pointer_i32_f32(ptr %out) #0 {
+  %load = load volatile i32, ptr addrspace(1) poison
+  store i32 %load, ptr %out
   ret void
 }
 
-define void @bitcast_pointer_i32_f16(half* %out) #0 {
-  %load = load volatile i32, i32 addrspace(1)* poison
-  %bitcast = bitcast half* %out to i32*
-  store i32 %load, i32* %bitcast
+define void @bitcast_pointer_i32_f16(ptr %out) #0 {
+  %load = load volatile i32, ptr addrspace(1) poison
+  store i32 %load, ptr %out
   ret void
 }
 
-define void @bitcast_pointer_f16_i32(i32* %out) #0 {
-  %load = load volatile half, half addrspace(1)* poison
-  %bitcast = bitcast i32* %out to half*
-  store half %load, half* %bitcast
+define void @bitcast_pointer_f16_i32(ptr %out) #0 {
+  %load = load volatile half, ptr addrspace(1) poison
+  store half %load, ptr %out
   ret void
 }
 
@@ -336,93 +330,80 @@ define void @bitcast_pointer_f16_i32(i32* %out) #0 {
 %struct.v4f32 = type { <4 x float> }
 
 
-define void @bitcast_struct_v3f32_v3f32(%struct.v3f32* %out, <3 x float> %value) #0 {
+define void @bitcast_struct_v3f32_v3f32(ptr %out, <3 x float> %value) #0 {
   %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  %cast = bitcast %struct.v3f32* %out to <4 x float>*
-  store <4 x float> %extractVec, <4 x float>* %cast, align 16
+  store <4 x float> %extractVec, ptr %out, align 16
   ret void
 }
 
 
-define void @bitcast_struct_v3f32_v3i32(%struct.v3f32* %out, <3 x i32> %value) #0 {
+define void @bitcast_struct_v3f32_v3i32(ptr %out, <3 x i32> %value) #0 {
   %extractVec = shufflevector <3 x i32> %value, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  %cast = bitcast %struct.v3f32* %out to <4 x i32>*
-  store <4 x i32> %extractVec, <4 x i32>* %cast, align 16
+  store <4 x i32> %extractVec, ptr %out, align 16
   ret void
 }
 
 
-define void @bitcast_struct_v4f32_v4f32(%struct.v4f32* %out, <4 x float> %value) #0 {
-  %cast = bitcast %struct.v4f32* %out to <4 x float>*
-  store <4 x float> %value, <4 x float>* %cast, align 16
+define void @bitcast_struct_v4f32_v4f32(ptr %out, <4 x float> %value) #0 {
+  store <4 x float> %value, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_struct_v3f32_v4i32(%struct.v3f32* %out, <4 x i32> %value) #0 {
-  %cast = bitcast %struct.v3f32* %out to <4 x i32>*
-  store <4 x i32> %value, <4 x i32>* %cast, align 16
+define void @bitcast_struct_v3f32_v4i32(ptr %out, <4 x i32> %value) #0 {
+  store <4 x i32> %value, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_struct_v4f32_v3f32(%struct.v4f32* %out, <3 x float> %value) #0 {
+define void @bitcast_struct_v4f32_v3f32(ptr %out, <3 x float> %value) #0 {
   %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  %cast = bitcast %struct.v4f32* %out to <4 x float>*
-  store <4 x float> %extractVec, <4 x float>* %cast, align 16
+  store <4 x float> %extractVec, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_struct_v3f32_v2f32(%struct.v3f32* %out, <2 x float> %value) #0 {
-  %cast = bitcast %struct.v3f32* %out to <2 x float>*
-  store <2 x float> %value, <2 x float>* %cast, align 8
+define void @bitcast_struct_v3f32_v2f32(ptr %out, <2 x float> %value) #0 {
+  store <2 x float> %value, ptr %out, align 8
   ret void
 }
 
-define void @bitcast_struct_v3f32_f32_v3f32(%struct.v3f32.f32* %out, <3 x float> %value) #0 {
+define void @bitcast_struct_v3f32_f32_v3f32(ptr %out, <3 x float> %value) #0 {
   %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  %cast = bitcast %struct.v3f32.f32* %out to <4 x float>*
-  store <4 x float> %extractVec, <4 x float>* %cast, align 16
+  store <4 x float> %extractVec, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_struct_v3f32_f32_v4f32(%struct.v3f32.f32* %out, <4 x float> %value) #0 {
-  %cast = bitcast %struct.v3f32.f32* %out to <4 x float>*
-  store <4 x float> %value, <4 x float>* %cast, align 16
+define void @bitcast_struct_v3f32_f32_v4f32(ptr %out, <4 x float> %value) #0 {
+  store <4 x float> %value, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_struct_i128_v4f32(%struct.i128* %out, <4 x float> %value) #0 {
-  %cast = bitcast %struct.i128* %out to <4 x float>*
-  store <4 x float> %value, <4 x float>* %cast, align 16
+define void @bitcast_struct_i128_v4f32(ptr %out, <4 x float> %value) #0 {
+  store <4 x float> %value, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_array_v4i32_v4f32([4 x i32]* %out, [4 x float] %value) #0 {
-  %cast = bitcast [4 x i32]* %out to [4 x float]*
-  store [4 x float] %value, [4 x float]* %cast, align 4
+define void @bitcast_array_v4i32_v4f32(ptr %out, [4 x float] %value) #0 {
+  store [4 x float] %value, ptr %out, align 4
   ret void
 }
 
 
-define void @multi_return_bitcast_struct_v3f32_v3f32(i1 %cond, %struct.v3f32* %out, <3 x float> %value) #0 {
+define void @multi_return_bitcast_struct_v3f32_v3f32(i1 %cond, ptr %out, <3 x float> %value) #0 {
 entry:
   br i1 %cond, label %ret0, label %ret1
 
 ret0:
   %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-  %cast0 = bitcast %struct.v3f32* %out to <4 x float>*
-  store <4 x float> %extractVec, <4 x float>* %cast0, align 16
+  store <4 x float> %extractVec, ptr %out, align 16
   ret void
 
 ret1:
-  %cast1 = bitcast %struct.v3f32* %out to <4 x float>*
-  %load = load <4 x float>, <4 x float> addrspace(1)* poison
-  store <4 x float> %load, <4 x float>* %cast1, align 16
+  %load = load <4 x float>, ptr addrspace(1) poison
+  store <4 x float> %load, ptr %out, align 16
   ret void
 }
 
-define void @bitcast_v3f32_struct_v3f32(<3 x float>* %out, %struct.v3f32 %value) #0 {
-  %cast = bitcast <3 x float>* %out to %struct.v3f32*
-  store %struct.v3f32 %value, %struct.v3f32* %cast, align 4
+define void @bitcast_v3f32_struct_v3f32(ptr %out, %struct.v3f32 %value) #0 {
+  store %struct.v3f32 %value, ptr %out, align 4
   ret void
 }
 
@@ -454,82 +435,82 @@ attributes #2 = { alwaysinline nounwind }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_no_use
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_byval_arg
-; CHECK-SAME: (i32* byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_optnone
-; CHECK-SAME: (i32* byval(i32) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_volatile
-; CHECK-SAME: (i32* byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store volatile i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store volatile i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_atomic
-; CHECK-SAME: (i32* byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store atomic i32 0, i32* [[VAL]] seq_cst, align 4
+; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store atomic i32 0, ptr [[VAL]] seq_cst, align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_store_pointer_val
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32* [[VAL]], i32** poison, align 8
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store ptr [[VAL]], ptr poison, align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_store_gep
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[VAL]], i32 1
-; CHECK-NEXT:    store i32 0, i32* [[GEP]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[VAL]], i32 1
+; CHECK-NEXT:    store i32 0, ptr [[GEP]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@skip_sret
-; CHECK-SAME: (i32* sret(i32) [[SRET:%.*]], i32* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 1, i32* [[SRET]], align 4
-; CHECK-NEXT:    store i32 0, i32* [[OUT]], align 4
+; CHECK-SAME: (ptr sret(i32) [[SRET:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 1, ptr [[SRET]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] zeroinitializer
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] @void_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] @void_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align.body
-; CHECK-SAME: (i32* align 8 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr align 8 [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] zeroinitializer
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align
-; CHECK-SAME: (i32* align 8 [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] @void_one_out_arg_i32_1_use_align.body(i32* poison)
+; CHECK-SAME: (ptr align 8 [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] @void_one_out_arg_i32_1_use_align.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN]] [[TMP2]], 0
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 8
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use.body
-; CHECK-SAME: (i1 [[ARG0:%.*]], i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    br i1 [[ARG0]], label [[RET0:%.*]], label [[RET1:%.*]]
 ; CHECK:       ret0:
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] zeroinitializer
@@ -538,195 +519,195 @@ attributes #2 = { alwaysinline nounwind }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use
-; CHECK-SAME: (i1 [[TMP0:%.*]], i32* [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] @void_one_out_arg_i32_2_use.body(i1 [[TMP0]], i32* poison)
+; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] @void_one_out_arg_i32_2_use.body(i1 [[TMP0]], ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_USE]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] { i32 1 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] @void_one_out_arg_i32_2_stores.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] @void_one_out_arg_i32_2_stores.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES]] [[TMP2]], 0
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    call void @may.clobber()
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] { i32 1 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] @void_one_out_arg_i32_2_stores_clobber.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] @void_one_out_arg_i32_2_stores_clobber.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER]] [[TMP2]], 0
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_call_may_clobber
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
 ; CHECK-NEXT:    call void @may.clobber()
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @may.clobber()
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] zeroinitializer
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] @void_one_out_arg_i32_pre_call_may_clobber.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] @void_one_out_arg_i32_pre_call_may_clobber.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER]] [[TMP2]], 0
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_reload
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[VAL]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr [[VAL]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_store_in_different_block
-; CHECK-SAME: (i32* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT:    store i32 0, i32* [[OUT]], align 4
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) poison, align 4
+; CHECK-NEXT:    store i32 0, ptr [[OUT]], align 4
 ; CHECK-NEXT:    br label [[RET:%.*]]
 ; CHECK:       ret:
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@unused_out_arg_one_branch
-; CHECK-SAME: (i1 [[ARG0:%.*]], i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    br i1 [[ARG0]], label [[RET0:%.*]], label [[RET1:%.*]]
 ; CHECK:       ret0:
 ; CHECK-NEXT:    ret void
 ; CHECK:       ret1:
-; CHECK-NEXT:    store i32 9, i32* [[VAL]], align 4
+; CHECK-NEXT:    store i32 9, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use.body
-; CHECK-SAME: (<2 x i32>* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] { <2 x i32> <i32 17, i32 9> }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use
-; CHECK-SAME: (<2 x i32>* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] @void_one_out_arg_v2i32_1_use.body(<2 x i32>* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] @void_one_out_arg_v2i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_V2I32_1_USE]] [[TMP2]], 0
-; CHECK-NEXT:    store <2 x i32> [[TMP3]], <2 x i32>* [[TMP0]], align 8
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use.body
-; CHECK-SAME: (%struct* [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] { [[STRUCT:%.*]] { i32 9, i8 99, float 4.000000e+00 } }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use
-; CHECK-SAME: (%struct* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] @void_one_out_arg_struct_1_use.body(%struct* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] @void_one_out_arg_struct_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_STRUCT_1_USE]] [[TMP2]], 0
-; CHECK-NEXT:    store [[STRUCT:%.*]] [[TMP3]], %struct* [[TMP0]], align 4
+; CHECK-NEXT:    store [[STRUCT:%.*]] [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] { i32 9, i32 24 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] @i32_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] @i32_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
 ; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@unused_different_type.body
-; CHECK-SAME: (i32* [[ARG0:%.*]], float* nocapture [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[ARG0:%.*]], ptr nocapture [[ARG1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[UNUSED_DIFFERENT_TYPE:%.*]] { float 4.000000e+00 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@unused_different_type
-; CHECK-SAME: (i32* [[TMP0:%.*]], float* nocapture [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[UNUSED_DIFFERENT_TYPE:%.*]] @unused_different_type.body(i32* [[TMP0]], float* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]], ptr nocapture [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[UNUSED_DIFFERENT_TYPE:%.*]] @unused_different_type.body(ptr [[TMP0]], ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[UNUSED_DIFFERENT_TYPE]] [[TMP3]], 0
-; CHECK-NEXT:    store float [[TMP4]], float* [[TMP1]], align 4
+; CHECK-NEXT:    store float [[TMP4]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias.body
-; CHECK-SAME: (i32* noalias [[OUT0:%.*]], i32* noalias [[OUT1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr noalias [[OUT0:%.*]], ptr noalias [[OUT1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] { i32 1, i32 2 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias
-; CHECK-SAME: (i32* noalias [[TMP0:%.*]], i32* noalias [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] @multiple_same_return_noalias.body(i32* poison, i32* poison)
+; CHECK-SAME: (ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] @multiple_same_return_noalias.body(ptr poison, ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 1
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias.body
-; CHECK-SAME: (i32* [[OUT0:%.*]], i32* [[OUT1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] { i32 2, i32 1 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias
-; CHECK-SAME: (i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] @multiple_same_return_mayalias.body(i32* poison, i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] @multiple_same_return_mayalias.body(ptr poison, ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 1
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order.body
-; CHECK-SAME: (i32* [[OUT0:%.*]], i32* [[OUT1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] { i32 1, i32 2 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order
-; CHECK-SAME: (i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] @multiple_same_return_mayalias_order.body(i32* poison, i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] @multiple_same_return_mayalias_order.body(ptr poison, ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 1
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@store_in_entry_block
-; CHECK-SAME: (i1 [[ARG0:%.*]], i32* [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VAL0:%.*]] = load i32, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT:    store i32 [[VAL0]], i32* [[OUT]], align 4
+; CHECK-NEXT:    [[VAL0:%.*]] = load i32, ptr addrspace(1) poison, align 4
+; CHECK-NEXT:    store i32 [[VAL0]], ptr [[OUT]], align 4
 ; CHECK-NEXT:    br i1 [[ARG0]], label [[IF:%.*]], label [[ENDIF:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, i32 addrspace(1)* poison, align 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr addrspace(1) poison, align 4
 ; CHECK-NEXT:    br label [[ENDIF]]
 ; CHECK:       endif:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL1]], [[IF]] ]
@@ -734,146 +715,144 @@ attributes #2 = { alwaysinline nounwind }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
 ; CHECK-NEXT:    ret i1 [[TMP4]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_zeroext_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_zeroext_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
 ; CHECK-NEXT:    ret i1 [[TMP4]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_signext_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_signext_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
 ; CHECK-NEXT:    ret i1 [[TMP4]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use.body
-; CHECK-SAME: (i32* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    ret [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] { i32 addrspace(1)* null, i32 24 }
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] { ptr addrspace(1) null, i32 24 }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] @p1i32_noalias_one_out_arg_i32_1_use.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] @p1i32_noalias_one_out_arg_i32_1_use.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0
-; CHECK-NEXT:    ret i32 addrspace(1)* [[TMP4]]
+; CHECK-NEXT:    ret ptr addrspace(1) [[TMP4]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@void_one_out_non_private_arg_i32_1_use
-; CHECK-SAME: (i32 addrspace(1)* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[VAL]], align 4
+; CHECK-SAME: (ptr addrspace(1) [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@func_ptr_type.body
-; CHECK-SAME: (void ()** [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[FUNC:%.*]] = load void ()*, void ()** poison, align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[FUNC_PTR_TYPE:%.*]] poison, void ()* [[FUNC]], 0
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[FUNC:%.*]] = load ptr, ptr poison, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0
 ; CHECK-NEXT:    ret [[FUNC_PTR_TYPE]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@func_ptr_type
-; CHECK-SAME: (void ()** [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[FUNC_PTR_TYPE:%.*]] @func_ptr_type.body(void ()** poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[FUNC_PTR_TYPE:%.*]] @func_ptr_type.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[FUNC_PTR_TYPE]] [[TMP2]], 0
-; CHECK-NEXT:    store void ()* [[TMP3]], void ()** [[TMP0]], align 8
+; CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type.body
-; CHECK-SAME: (void ()** [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[FUNC:%.*]] = load i32 ()*, i32 ()** poison, align 8
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast void ()** [[OUT]] to i32 ()**
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] poison, i32 ()* [[FUNC]], 0
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[FUNC:%.*]] = load ptr, ptr poison, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0
 ; CHECK-NEXT:    ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type
-; CHECK-SAME: (void ()** [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(void ()** poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast void ()** [[TMP0]] to i32 ()**
-; CHECK-NEXT:    store i32 ()* [[TMP3]], i32 ()** [[TMP4]], align 8
+; CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@out_arg_small_array.body
-; CHECK-SAME: ([4 x i32]* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    ret [[OUT_ARG_SMALL_ARRAY:%.*]] { [4 x i32] [i32 0, i32 1, i32 2, i32 3] }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@out_arg_small_array
-; CHECK-SAME: ([4 x i32]* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[OUT_ARG_SMALL_ARRAY:%.*]] @out_arg_small_array.body([4 x i32]* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[OUT_ARG_SMALL_ARRAY:%.*]] @out_arg_small_array.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[OUT_ARG_SMALL_ARRAY]] [[TMP2]], 0
-; CHECK-NEXT:    store [4 x i32] [[TMP3]], [4 x i32]* [[TMP0]], align 4
+; CHECK-NEXT:    store [4 x i32] [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@out_arg_large_array
-; CHECK-SAME: ([17 x i32]* [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    store [17 x i32] zeroinitializer, [17 x i32]* [[VAL]], align 4
+; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    store [17 x i32] zeroinitializer, ptr [[VAL]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@num_regs_return_limit
-; CHECK-SAME: (i32* [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <16 x i32>, <16 x i32> addrspace(1)* poison, align 64
-; CHECK-NEXT:    store i32 [[VAL]], i32* [[OUT]], align 4
+; CHECK-SAME: (ptr [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <16 x i32>, ptr addrspace(1) poison, align 64
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[OUT]], align 4
 ; CHECK-NEXT:    ret <16 x i32> [[LOAD]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit.body
-; CHECK-SAME: (i32* [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile [15 x i32], [15 x i32] addrspace(1)* poison, align 4
+; CHECK-SAME: (ptr [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT:%.*]] poison, [15 x i32] [[LOAD]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT]] [[TMP1]], i32 [[VAL]], 1
 ; CHECK-NEXT:    ret [[NUM_REGS_REACH_LIMIT]] [[TMP2]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit
-; CHECK-SAME: (i32* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[NUM_REGS_REACH_LIMIT:%.*]] @num_regs_reach_limit.body(i32* poison, i32 [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[NUM_REGS_REACH_LIMIT:%.*]] @num_regs_reach_limit.body(ptr poison, i32 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 1
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 0
 ; CHECK-NEXT:    ret [15 x i32] [[TMP5]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover.body
-; CHECK-SAME: (i32* [[OUT0:%.*]], i32* [[OUT1:%.*]], i32 [[VAL0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD0:%.*]] = load volatile [15 x i32], [15 x i32] addrspace(1)* poison, align 4
-; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile i32, i32 addrspace(1)* poison, align 4
+; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], i32 [[VAL0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD0:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4
+; CHECK-NEXT:    [[LOAD1:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] poison, [15 x i32] [[LOAD0]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP1]], i32 [[LOAD1]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP2]], i32 [[VAL0]], 2
@@ -881,332 +860,297 @@ attributes #2 = { alwaysinline nounwind }
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover
-; CHECK-SAME: (i32* [[TMP0:%.*]], i32* [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = call [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] @num_regs_reach_limit_leftover.body(i32* poison, i32* poison, i32 [[TMP2]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = call [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] @num_regs_reach_limit_leftover.body(ptr poison, ptr poison, i32 [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 1
-; CHECK-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 2
-; CHECK-NEXT:    store i32 [[TMP6]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP6]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 0
 ; CHECK-NEXT:    ret [15 x i32] [[TMP7]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info.body
-; CHECK-SAME: (i32 [[ARG0:%.*]], i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @may.clobber(), !dbg [[DBG5:![0-9]+]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[PRESERVE_DEBUG_INFO:%.*]] poison, i32 [[ARG0]], 0, !dbg [[DBG11:![0-9]+]]
 ; CHECK-NEXT:    ret [[PRESERVE_DEBUG_INFO]] [[TMP1]], !dbg [[DBG11]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32* [[TMP1:%.*]]) #[[ATTR2]] !dbg [[DBG6:![0-9]+]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[PRESERVE_DEBUG_INFO:%.*]] @preserve_debug_info.body(i32 [[TMP0]], i32* poison)
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !dbg [[DBG6:![0-9]+]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[PRESERVE_DEBUG_INFO:%.*]] @preserve_debug_info.body(i32 [[TMP0]], ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[PRESERVE_DEBUG_INFO]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@preserve_metadata.body
-; CHECK-SAME: (i32 [[ARG0:%.*]], i32* [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @may.clobber()
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[PRESERVE_METADATA:%.*]] poison, i32 [[ARG0]], 0
 ; CHECK-NEXT:    ret [[PRESERVE_METADATA]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@preserve_metadata
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32* [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_access_qual !12 {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[PRESERVE_METADATA:%.*]] @preserve_metadata.body(i32 [[TMP0]], i32* poison)
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_access_qual !12 {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[PRESERVE_METADATA:%.*]] @preserve_metadata.body(i32 [[TMP0]], ptr poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[PRESERVE_METADATA]] [[TMP3]], 0
-; CHECK-NEXT:    store i32 [[TMP4]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[TMP4]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32.body
-; CHECK-SAME: (<3 x i32>* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison, align 16
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x i32>* [[OUT]] to <4 x i32>*
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] poison, <4 x i32> [[LOAD]], 0
 ; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32
-; CHECK-SAME: (<3 x i32>* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(<3 x i32>* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x i32>* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32.body
-; CHECK-SAME: (<3 x float>* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* poison, align 16
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x float>* [[OUT]] to <4 x i32>*
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] poison, <4 x i32> [[LOAD]], 0
 ; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32
-; CHECK-SAME: (<3 x float>* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(<3 x float>* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x float>* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32.body
-; CHECK-SAME: (float* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float* [[OUT]] to i32*
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] poison, i32 [[LOAD]], 0
 ; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32
-; CHECK-SAME: (float* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(float* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP0]] to i32*
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body
-; CHECK-SAME: (half* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast half* [[OUT]] to i32*
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] poison, i32 [[LOAD]], 0
 ; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F16]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16
-; CHECK-SAME: (half* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(half* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half* [[TMP0]] to i32*
-; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body
-; CHECK-SAME: (i32* [[OUT:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile half, half addrspace(1)* poison, align 2
-; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[OUT]] to half*
+; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load volatile half, ptr addrspace(1) poison, align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] poison, half [[LOAD]], 0
 ; CHECK-NEXT:    ret [[BITCAST_POINTER_F16_I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32
-; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(i32* poison)
+; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(ptr poison)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to half*
-; CHECK-NEXT:    store half [[TMP3]], half* [[TMP4]], align 2
+; CHECK-NEXT:    store half [[TMP3]], ptr [[TMP0]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32.body
-; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32
-; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(%struct.v3f32* poison, <3 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(ptr poison, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32.body
-; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] poison, <4 x i32> [[EXTRACTVEC]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32
-; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(%struct.v3f32* poison, <3 x i32> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(ptr poison, <3 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body
-; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32
-; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(%struct.v4f32* poison, <4 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(ptr poison, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body
-; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] poison, <4 x i32> [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32
-; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(%struct.v3f32* poison, <4 x i32> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(ptr poison, <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32.body
-; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32
-; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(%struct.v4f32* poison, <3 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(ptr poison, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body
-; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <2 x float>*
+; CHECK-SAME: (ptr [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] poison, <2 x float> [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
-; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(%struct.v3f32* poison, <2 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(ptr poison, <2 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body
-; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
-; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(%struct.v3f32.f32* poison, <3 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(ptr poison, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body
-; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
-; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(%struct.v3f32.f32* poison, <4 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(ptr poison, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body
-; CHECK-SAME: (%struct.i128* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.i128* [[OUT]] to <4 x float>*
+; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32
-; CHECK-SAME: (%struct.i128* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(%struct.i128* poison, <4 x float> [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(ptr poison, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.i128* [[TMP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body
-; CHECK-SAME: ([4 x i32]* [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast [4 x i32]* [[OUT]] to [4 x float]*
+; CHECK-SAME: (ptr [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] poison, [4 x float] [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
-; CHECK-SAME: ([4 x i32]* [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body([4 x i32]* poison, [4 x float] [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body(ptr poison, [4 x float] [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast [4 x i32]* [[TMP0]] to [4 x float]*
-; CHECK-NEXT:    store [4 x float] [[TMP4]], [4 x float]* [[TMP5]], align 4
+; CHECK-NEXT:    store [4 x float] [[TMP4]], ptr [[TMP0]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32.body
-; CHECK-SAME: (i1 [[COND:%.*]], %struct.v3f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: (i1 [[COND:%.*]], ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[COND]], label [[RET0:%.*]], label [[RET1:%.*]]
 ; CHECK:       ret0:
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[CAST0:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0
 ; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP0]]
 ; CHECK:       ret1:
-; CHECK-NEXT:    [[CAST1:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float> addrspace(1)* poison, align 16
+; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, ptr addrspace(1) poison, align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] poison, <4 x float> [[LOAD]], 0
 ; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32
-; CHECK-SAME: (i1 [[TMP0:%.*]], %struct.v3f32* [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], %struct.v3f32* poison, <3 x float> [[TMP2]])
+; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], ptr poison, <3 x float> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.v3f32* [[TMP1]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body
-; CHECK-SAME: (<3 x float>* [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[CAST:%.*]] = bitcast <3 x float>* [[OUT]] to %struct.v3f32*
+; CHECK-SAME: (ptr [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] poison, [[STRUCT_V3F32]] [[VALUE]], 0
 ; CHECK-NEXT:    ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
-; CHECK-SAME: (<3 x float>* [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(<3 x float>* poison, [[STRUCT_V3F32]] [[TMP1]])
+; CHECK-SAME: (ptr [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(ptr poison, [[STRUCT_V3F32]] [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <3 x float>* [[TMP0]] to %struct.v3f32*
-; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], %struct.v3f32* [[TMP5]], align 16
+; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], ptr [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
index 6d4e5e109b1b..0b58b9505052 100644
--- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -10,9 +10,9 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) {
   %add = add i32 %b, 65
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -20,38 +20,38 @@ define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %a, i32 %b) {
   %add0 = add i32 %a, 65
   %add1 = add i32 %b, 65
-  store i32 %add0, i32 addrspace(1)* %out0
-  store i32 %add1, i32 addrspace(1)* %out1
+  store i32 %add0, ptr addrspace(1) %out0
+  store i32 %add1, ptr addrspace(1) %out1
   ret void
 }
 
 ; SI-LABEL: {{^}}s_addk_i32_k1:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k1(ptr addrspace(1) %out, i32 %b) {
   %add = add i32 %b, 32767 ; (1 << 15) - 1
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_addk_i32_k2:
 ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k2(ptr addrspace(1) %out, i32 %b) {
   %add = add i32 %b, -17
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_addk_i32_k3:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k3(ptr addrspace(1) %out, i32 %b) {
   %add = add i32 %b, -65
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -60,9 +60,9 @@ define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
 ; SI: s_endpgm
 ; Note: dummy argument here to prevent combining of descriptor loads for %out and %b
-define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, i32 %dummy, <2 x i32> %b) {
+define amdgpu_kernel void @s_addk_v2i32_k0(ptr addrspace(1) %out, i32 %dummy, <2 x i32> %b) {
   %add = add <2 x i32> %b, <i32 65, i32 66>
-  store <2 x i32> %add, <2 x i32> addrspace(1)* %out
+  store <2 x i32> %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -72,9 +72,9 @@ define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, i32 %du
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+define amdgpu_kernel void @s_addk_v4i32_k0(ptr addrspace(1) %out, <4 x i32> %b) {
   %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
-  store <4 x i32> %add, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -88,18 +88,18 @@ define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i3
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
 ; SI: s_endpgm
-define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+define amdgpu_kernel void @s_addk_v8i32_k0(ptr addrspace(1) %out, <8 x i32> %b) {
   %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
-  store <8 x i32> %add, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %add, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}no_s_addk_i32_k0:
 ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_addk_i32_k0(ptr addrspace(1) %out, i32 %b) {
   %add = add i32 %b, 32768 ; 1 << 15
-  store i32 %add, i32 addrspace(1)* %out
+  store i32 %add, ptr addrspace(1) %out
   ret void
 }
 
@@ -107,10 +107,10 @@ define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 
 ; SI-LABEL: {{^}}commute_s_addk_i32:
 ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_addk_i32(ptr addrspace(1) %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = add i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
+  call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
index 28ee9637ef9c..a8f350618936 100644
--- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -6,10 +6,10 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k0(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 4295032831)
   ret void
 }
@@ -19,10 +19,10 @@ define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k1(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 4295000063)
   ret void
 }
@@ -33,10 +33,10 @@ define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x7fff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k2(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 274877939711)
   ret void
 }
@@ -46,10 +46,10 @@ define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x8000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k3(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 4295000064)
   ret void
 }
@@ -59,10 +59,10 @@ define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x20000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k4(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 4295098368)
   ret void
 }
@@ -72,10 +72,10 @@ define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffffef, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xff00ffff, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k5(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 18374967954648334319)
   ret void
 }
@@ -85,10 +85,10 @@ define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x41, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k6(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 270582939713)
   ret void
 }
@@ -98,10 +98,10 @@ define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x2000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x4000, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k7(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 70368744185856)
   ret void
 }
@@ -111,10 +111,10 @@ define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8000, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k8(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 1229782942255906816)
   ret void
 }
@@ -124,10 +124,10 @@ define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8001, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k9(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 1229782942255906817)
   ret void
 }
@@ -137,10 +137,10 @@ define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8888, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k10(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 1229782942255909000)
   ret void
 }
@@ -150,10 +150,10 @@ define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff8fff, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k11(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 1229782942255910911)
   ret void
 }
@@ -163,10 +163,10 @@ define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffff7001, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x11111111, v[[HI_VREG]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
-  %loada = load i64, i64 addrspace(1)* %a, align 4
+define amdgpu_kernel void @s_movk_i32_k12(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
-  store i64 %or, i64 addrspace(1)* %out
+  store i64 %or, ptr addrspace(1) %out
   call void asm sideeffect "; use $0", "s"(i64 1229782942255902721)
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
index 25ab7201924e..c7987d3d0091 100644
--- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -7,36 +7,36 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
   %mul = mul i32 %b, 65
-  store i32 %mul, i32 addrspace(1)* %out
+  store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_mulk_i32_k1:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) {
   %mul = mul i32 %b, 32767 ; (1 << 15) - 1
-  store i32 %mul, i32 addrspace(1)* %out
+  store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}s_mulk_i32_k2:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) {
   %mul = mul i32 %b, -17
-  store i32 %mul, i32 addrspace(1)* %out
+  store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
 ; SI-LABEL: {{^}}no_s_mulk_i32_k0:
 ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) {
   %mul = mul i32 %b, 32769 ; 1 << 15 + 1
-  store i32 %mul, i32 addrspace(1)* %out
+  store i32 %mul, ptr addrspace(1) %out
   ret void
 }
 
@@ -44,10 +44,10 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 
 ; SI-LABEL: {{^}}commute_s_mulk_i32:
 ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = mul i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
+  call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 94c946321499..1b0306559295 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -12,13 +12,13 @@ define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
-define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) {
   %icmp0 = icmp ugt i32 %a, 90
   %t0 = select i1 %icmp0, i32 %a, i32 90
 
@@ -28,13 +28,13 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, 20
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b
 
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -51,7 +51,7 @@ define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b
 ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -59,16 +59,16 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out,
   %t1 = select i1 %icmp1, i32 %a, i32 %b
 
   %ret0 = sub i32 %t0, %t1
-  store volatile i32 %ret0, i32  addrspace(5)*undef
+  store volatile i32 %ret0, ptr addrspace(5) undef
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -77,17 +77,17 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out,
 
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, %c
-  store volatile i32 %ret, i32  addrspace(5)*undef
-  store i32 %ret, i32 addrspace(1)* %out
+  store volatile i32 %ret, ptr addrspace(5) undef
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
-  store volatile i32 %t0, i32  addrspace(5)*undef
+  store volatile i32 %t0, ptr addrspace(5) undef
 
   %icmp1 = icmp ule i32 %a, %b
   %t1 = select i1 %icmp1, i32 %a, i32 %b
@@ -95,40 +95,40 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out,
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
   %icmp1 = icmp ule i32 %a, %b
   %t1 = select i1 %icmp1, i32 %a, i32 %b
 
-  store volatile i32 %t1, i32  addrspace(5)*undef
+  store volatile i32 %t1, ptr addrspace(5) undef
 
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
-  store volatile i32 %sub0, i32  addrspace(5)*undef
+  store volatile i32 %sub0, ptr addrspace(5) undef
   %sub1 = sub i32 %b, %a
   %ret0 = select i1 %icmp0, i32 %sub0, i32 %sub1
 
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -136,16 +136,16 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
 ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
   %ret0 = select i1 %icmp0, i32 %sub0, i32 %sub1
-  store volatile i32 %ret0, i32  addrspace(5)*undef
+  store volatile i32 %ret0, ptr addrspace(5) undef
 
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -154,7 +154,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %ou
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
 
@@ -164,7 +164,7 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <
   %ret0 = sub <4 x i32> %t0, %t1
   %ret = add <4 x i32> %ret0, %c
 
-  store <4 x i32> %ret, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -173,7 +173,7 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %sub0 = sub <4 x i32> %a, %b
   %sub1 = sub <4 x i32> %b, %a
@@ -181,13 +181,13 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <
 
   %ret = add <4 x i32> %ret0, %c
 
-  store <4 x i32> %ret, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
 
   %icmp0 = icmp ugt i16 %a, %b
   %t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -198,16 +198,16 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i1
   %ret0 = sub i16 %t0, %t1
   %ret = add i16 %ret0, %c
 
-  store i16 %ret, i16 addrspace(1)* %out
+  store i16 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
-  %a = load volatile i16, i16 addrspace(1)* undef
-  %b = load volatile i16, i16 addrspace(1)* undef
-  %c = load volatile i16, i16 addrspace(1)* undef
+define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
+  %a = load volatile i16, ptr addrspace(1) undef
+  %b = load volatile i16, ptr addrspace(1) undef
+  %c = load volatile i16, ptr addrspace(1) undef
   %icmp0 = icmp ugt i16 %a, %b
   %sub0 = sub i16 %a, %b
   %sub1 = sub i16 %b, %a
@@ -215,13 +215,13 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
 
   %ret = add i16 %ret0, %c
 
-  store i16 %ret, i16 addrspace(1)* %out
+  store i16 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %t0 = select i1 %icmp0, i8 %a, i8 %b
 
@@ -231,16 +231,16 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b
   %ret0 = sub i8 %t0, %t1
   %ret = add i8 %ret0, %c
 
-  store i8 %ret, i8 addrspace(1)* %out
+  store i8 %ret, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
-  %a = load volatile i8, i8 addrspace(1)* undef
-  %b = load volatile i8, i8 addrspace(1)* undef
-  %c = load volatile i8, i8 addrspace(1)* undef
+define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
+  %a = load volatile i8, ptr addrspace(1) undef
+  %b = load volatile i8, ptr addrspace(1) undef
+  %c = load volatile i8, ptr addrspace(1) undef
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
   %sub1 = sub i8 %b, %a
@@ -248,7 +248,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
 
   %ret = add i8 %ret0, %c
 
-  store i8 %ret, i8 addrspace(1)* %out
+  store i8 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -260,7 +260,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
 ; GCN-DAG: s_sub_i32
 ; GCN-DAG: s_lshr_b32
 ; GCN: s_add_i32
-define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
   %sub1 = sub i8 %b, %a
@@ -268,7 +268,7 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %
 
   %ret = add i8 %ret0, %c
 
-  store i8 %ret, i8 addrspace(1)* %out
+  store i8 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -277,7 +277,7 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %
 ; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -287,7 +287,7 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)*
   %ret0 = sub i32 %t0, %t1
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 
@@ -295,7 +295,7 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)*
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %d
   %sub1 = sub i32 %b, %a
@@ -303,7 +303,7 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)*
 
   %ret = add i32 %ret0, %c
 
-  store i32 %ret, i32 addrspace(1)* %out
+  store i32 %ret, ptr addrspace(1) %out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index a9a3945e2f94..1d91d59892ae 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -12,7 +12,7 @@ declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
 declare { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 
-define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
 ; SI-LABEL: saddo_i64_zext:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -120,11 +120,11 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
   %carry = extractvalue { i64, i1 } %sadd, 1
   %ext = zext i1 %carry to i64
   %add2 = add i64 %val, %ext
-  store i64 %add2, i64 addrspace(1)* %out, align 8
+  store i64 %add2, ptr addrspace(1) %out, align 8
   ret void
 }
 
-define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
 ; SI-LABEL: s_saddo_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
@@ -224,12 +224,12 @@ define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
   %carry = extractvalue { i32, i1 } %sadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
-define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; SI-LABEL: v_saddo_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -336,17 +336,17 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX11-NEXT:    global_store_b8 v0, v2, s[2:3]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %bptr, align 4
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
   %carry = extractvalue { i32, i1 } %sadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
-define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
 ; SI-LABEL: s_saddo_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -453,12 +453,12 @@ define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
-define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; SI-LABEL: v_saddo_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -574,17 +574,17 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; GFX11-NEXT:    global_store_b8 v6, v0, s[6:7]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load i64, i64 addrspace(1)* %aptr, align 4
-  %b = load i64, i64 addrspace(1)* %bptr, align 4
+  %a = load i64, ptr addrspace(1) %aptr, align 4
+  %b = load i64, ptr addrspace(1) %bptr, align 4
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
-define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
 ; SI-LABEL: v_saddo_v2i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@@ -713,13 +713,13 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
+  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
   %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
   %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
   %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   %carry.ext = zext <2 x i1> %carry to <2 x i32>
-  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index c5f8abc08f00..3307b837d393 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
 
-define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @mubuf(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
@@ -35,14 +35,14 @@ entry:
 loop:                                             ; preds = %loop, %entry
   %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ]
   %tmp5 = add i64 %tmp2, %tmp4
-  %tmp6 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp5
-  %tmp7 = load i8, i8 addrspace(1)* %tmp6, align 1
+  %tmp6 = getelementptr i8, ptr addrspace(1) %in, i64 %tmp5
+  %tmp7 = load i8, ptr addrspace(1) %tmp6, align 1
   %tmp8 = or i64 %tmp5, 1
-  %tmp9 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp8
-  %tmp10 = load i8, i8 addrspace(1)* %tmp9, align 1
+  %tmp9 = getelementptr i8, ptr addrspace(1) %in, i64 %tmp8
+  %tmp10 = load i8, ptr addrspace(1) %tmp9, align 1
   %tmp11 = add i8 %tmp7, %tmp10
   %tmp12 = sext i8 %tmp11 to i32
-  store i32 %tmp12, i32 addrspace(1)* %out
+  store i32 %tmp12, ptr addrspace(1) %out
   %tmp13 = icmp slt i64 %tmp5, 10
   br i1 %tmp13, label %loop, label %done
 
@@ -64,25 +64,25 @@ done:                                             ; preds = %loop
 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
-define amdgpu_kernel void @smrd_valu(i32 addrspace(4)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @smrd_valu(ptr addrspace(1) %in, i32 %a, i32 %b, ptr addrspace(1) %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
   br i1 %tmp, label %if, label %else
 
 if:                                               ; preds = %entry
-  %tmp1 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
+  %tmp1 = load ptr addrspace(4), ptr addrspace(1) %in
   br label %endif
 
 else:                                             ; preds = %entry
-  %tmp2 = getelementptr i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
-  %tmp3 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %tmp2
+  %tmp2 = getelementptr ptr addrspace(4), ptr addrspace(1) %in
+  %tmp3 = load ptr addrspace(4), ptr addrspace(1) %tmp2
   br label %endif
 
 endif:                                            ; preds = %else, %if
-  %tmp4 = phi i32 addrspace(4)* [ %tmp1, %if ], [ %tmp3, %else ]
-  %tmp5 = getelementptr i32, i32 addrspace(4)* %tmp4, i32 3000
-  %tmp6 = load i32, i32 addrspace(4)* %tmp5
-  store i32 %tmp6, i32 addrspace(1)* %out
+  %tmp4 = phi ptr addrspace(4) [ %tmp1, %if ], [ %tmp3, %else ]
+  %tmp5 = getelementptr i32, ptr addrspace(4) %tmp4, i32 3000
+  %tmp6 = load i32, ptr addrspace(4) %tmp5
+  store i32 %tmp6, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,13 +92,13 @@ endif:                                            ; preds = %else, %if
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
-  %tmp3 = load i32, i32 addrspace(4)* %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %out
+  %tmp2 = getelementptr [8 x i32], ptr addrspace(4) %in, i32 %tmp, i32 4
+  %tmp3 = load i32, ptr addrspace(4) %tmp2
+  store i32 %tmp3, ptr addrspace(1) %out
   ret void
 }
 
@@ -112,14 +112,14 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
-define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr i32, i32 addrspace(4)* %in, i32 %tmp
-  %tmp3 = getelementptr i32, i32 addrspace(4)* %tmp2, i32 5000
-  %tmp4 = load i32, i32 addrspace(4)* %tmp3
+  %tmp2 = getelementptr i32, ptr addrspace(4) %in, i32 %tmp
+  %tmp3 = getelementptr i32, ptr addrspace(4) %tmp2, i32 5000
+  %tmp4 = load i32, ptr addrspace(4) %tmp3
   %tmp5 = add i32 %tmp4, %c
-  store i32 %tmp5, i32 addrspace(1)* %out
+  store i32 %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -132,14 +132,14 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(4)* %in, i64 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x2(ptr addrspace(1) %out, ptr addrspace(4) %in, i64 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr i64, i64 addrspace(4)* %in, i32 %tmp
-  %tmp3 = getelementptr i64, i64 addrspace(4)* %tmp2, i32 5000
-  %tmp4 = load i64, i64 addrspace(4)* %tmp3
+  %tmp2 = getelementptr i64, ptr addrspace(4) %in, i32 %tmp
+  %tmp3 = getelementptr i64, ptr addrspace(4) %tmp2, i32 5000
+  %tmp4 = load i64, ptr addrspace(4) %tmp3
   %tmp5 = or i64 %tmp4, %c
-  store i64 %tmp5, i64 addrspace(1)* %out
+  store i64 %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -154,14 +154,14 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in, <4 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x4(ptr addrspace(1) %out, ptr addrspace(4) %in, <4 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %in, i32 %tmp
-  %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %tmp2, i32 1234
-  %tmp4 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp3
+  %tmp2 = getelementptr <4 x i32>, ptr addrspace(4) %in, i32 %tmp
+  %tmp3 = getelementptr <4 x i32>, ptr addrspace(4) %tmp2, i32 1234
+  %tmp4 = load <4 x i32>, ptr addrspace(4) %tmp3
   %tmp5 = or <4 x i32> %tmp4, %c
-  store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out
+  store <4 x i32> %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -188,14 +188,14 @@ entry:
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in, <8 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x8(ptr addrspace(1) %out, ptr addrspace(4) %in, <8 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp
-  %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %tmp2, i32 1234
-  %tmp4 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp3
+  %tmp2 = getelementptr <8 x i32>, ptr addrspace(4) %in, i32 %tmp
+  %tmp3 = getelementptr <8 x i32>, ptr addrspace(4) %tmp2, i32 1234
+  %tmp4 = load <8 x i32>, ptr addrspace(4) %tmp3
   %tmp5 = or <8 x i32> %tmp4, %c
-  store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out
+  store <8 x i32> %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -234,14 +234,14 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; GCN: s_endpgm
-define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in, <16 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x16(ptr addrspace(1) %out, ptr addrspace(4) %in, <16 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp
-  %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %tmp2, i32 1234
-  %tmp4 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp3
+  %tmp2 = getelementptr <16 x i32>, ptr addrspace(4) %in, i32 %tmp
+  %tmp3 = getelementptr <16 x i32>, ptr addrspace(4) %tmp2, i32 1234
+  %tmp4 = load <16 x i32>, ptr addrspace(4) %tmp3
   %tmp5 = or <16 x i32> %tmp4, %c
-  store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
+  store <16 x i32> %tmp5, ptr addrspace(1) %out
   ret void
 }
 
@@ -251,27 +251,27 @@ entry:
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
 ; GCN-NOHSA: buffer_store_dword [[ADD]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
-define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in, i32 %a) #1 {
+define amdgpu_kernel void @smrd_valu2_salu_user(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %a) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
-  %tmp3 = load i32, i32 addrspace(4)* %tmp2
+  %tmp2 = getelementptr [8 x i32], ptr addrspace(4) %in, i32 %tmp, i32 4
+  %tmp3 = load i32, ptr addrspace(4) %tmp2
   %tmp4 = add i32 %tmp3, %a
-  store i32 %tmp4, i32 addrspace(1)* %out
+  store i32 %tmp4, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_max_smrd_offset(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 255
-  %tmp3 = load i32, i32 addrspace(4)* %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %out
+  %tmp2 = getelementptr [1024 x i32], ptr addrspace(4) %in, i32 %tmp, i32 255
+  %tmp3 = load i32, ptr addrspace(4) %tmp2
+  store i32 %tmp3, ptr addrspace(1) %out
   ret void
 }
 
@@ -279,13 +279,13 @@ entry:
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(4) %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
-  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 256
-  %tmp3 = load i32, i32 addrspace(4)* %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %out
+  %tmp2 = getelementptr [1024 x i32], ptr addrspace(4) %in, i32 %tmp, i32 256
+  %tmp3 = load i32, ptr addrspace(4) %tmp2
+  store i32 %tmp3, ptr addrspace(1) %out
   ret void
 }
 
@@ -294,13 +294,12 @@ entry:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
-  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4
-  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
+  %tmp3 = load <8 x i32>, ptr addrspace(4) %tmp1, align 4
+  store <8 x i32> %tmp3, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -317,12 +316,11 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32_salu_user(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
-  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
+  %tmp3 = load <8 x i32>, ptr addrspace(4) %tmp1, align 4
 
   %elt0 = extractelement <8 x i32> %tmp3, i32 0
   %elt1 = extractelement <8 x i32> %tmp3, i32 1
@@ -341,7 +339,7 @@ entry:
   %add5 = add i32 %add4, %elt6
   %add6 = add i32 %add5, %elt7
 
-  store i32 %add6, i32 addrspace(1)* %out
+  store i32 %add6, ptr addrspace(1) %out
   ret void
 }
 
@@ -354,13 +352,12 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
-  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4
-  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
+  %tmp3 = load <16 x i32>, ptr addrspace(4) %tmp1, align 4
+  store <16 x i32> %tmp3, ptr addrspace(1) %out, align 32
   ret void
 }
 
@@ -389,12 +386,11 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32_salu_user(ptr addrspace(1) %out, ptr addrspace(4) nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
-  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
-  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4
+  %tmp1 = getelementptr inbounds i32, ptr addrspace(4) %in, i32 %tmp0
+  %tmp3 = load <16 x i32>, ptr addrspace(4) %tmp1, align 4
 
   %elt0 = extractelement <16 x i32> %tmp3, i32 0
   %elt1 = extractelement <16 x i32> %tmp3, i32 1
@@ -429,7 +425,7 @@ entry:
   %add13 = add i32 %add12, %elt14
   %add14 = add i32 %add13, %elt15
 
-  store i32 %add14, i32 addrspace(1)* %out
+  store i32 %add14, ptr addrspace(1) %out
   ret void
 }
 
@@ -444,7 +440,7 @@ entry:
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, ptr addrspace(1) %out, ptr addrspace(1) %in) {
 bb3:                                              ; preds = %bb2
   %tmp0 = bitcast i32 %cond to float
   %tmp1 = fadd float %tmp0, 2.500000e-01
@@ -453,7 +449,7 @@ bb3:                                              ; preds = %bb2
   br i1 %tmp3, label %bb6, label %bb7
 
 bb6:
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, ptr addrspace(1) %out
   br label %bb7
 
 bb7:                                              ; preds = %bb3
@@ -478,7 +474,7 @@ bb2:
 
 bb4:
   %tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp, %bb1 ]
-  store volatile i32 %tmp5, i32 addrspace(1)* undef
+  store volatile i32 %tmp5, ptr addrspace(1) undef
   br label %bb1
 }
 
@@ -487,7 +483,7 @@ bb4:
 ; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]:
 ; GCN: s_xor_b32 [[B]], [[B]], 0x400
 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
-define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
+define amdgpu_kernel void @phi_imm_in_sgprs(ptr addrspace(3) %out, i32 %cond) {
 entry:
   br label %loop
 
@@ -496,8 +492,8 @@ loop:
   %offset = phi i32 [1024, %entry], [%offset.xor, %loop]
   %offset.xor = xor i32 %offset, 1024
   %offset.i = add i32 %offset.xor, %i
-  %ptr = getelementptr i32, i32 addrspace(3)* %out, i32 %offset.i
-  store i32 0, i32 addrspace(3)* %ptr
+  %ptr = getelementptr i32, ptr addrspace(3) %out, i32 %offset.i
+  store i32 0, ptr addrspace(3) %ptr
   %i.add = add i32 %i, 1
   %cmp = icmp ult i32 %i.add, %cond
   br i1 %cmp, label %loop, label %exit

diff  --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index c95beaea5a7e..363d568f9c11 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -15,8 +15,8 @@
 define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 .entry:
   %tmp31 = sext i32 %arg18 to i64
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
-  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31
+  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -29,9 +29,9 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
 .entry:
   %tmp1 = zext i32 %arg18 to i64
-  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -44,9 +44,9 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
 define amdgpu_ps float @const_nonuniform(i32 %arg18) {
 .entry:
   %tmp1 = zext i32 %arg18 to i64
-  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 1
-  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1
+  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -59,9 +59,9 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
 define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
 .entry:
   %tmp1 = zext i32 %arg18 to i64
-  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -73,8 +73,8 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
 define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
 .entry:
   %tmp31 = sext i32 %arg18 to i64
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
-  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31, i64 1
+  %tmp33 = load float, ptr addrspace(1) %tmp32, align 4
   ret float %tmp33
 }
 
@@ -85,9 +85,9 @@ define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
 define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
 .entry:
   %tmp1 = zext i32 %arg18 to i64
-  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
-  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, ptr addrspace(1) %tmp32, align 4
   ret float %tmp33
 }
 
@@ -98,9 +98,9 @@ define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18)
 define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
 .entry:
   %tmp1 = zext i32 %arg18 to i64
-  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
-  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
-  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
+  %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, ptr addrspace(1) %tmp32, align 4
   ret float %tmp33
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index ac9e3bf33a96..56e56c07b0f0 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -8,7 +8,7 @@
 ; Test that add/sub with a constant is swapped to sub/add with negated
 ; constant to minimize code size.
 
-define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_x_sub_64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -77,15 +77,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrs
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 %x, 64
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_x_sub_64_multi_use:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -184,18 +184,18 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i32, i32 addrspace(1)* %gep
-  %y = load volatile i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i32, ptr addrspace(1) %gep
+  %y = load volatile i32, ptr addrspace(1) %gep
   %result0 = sub i32 %x, 64
   %result1 = sub i32 %y, 64
-  store volatile i32 %result0, i32 addrspace(1)* %gep.out
-  store volatile i32 %result1, i32 addrspace(1)* %gep.out
+  store volatile i32 %result0, ptr addrspace(1) %gep.out
+  store volatile i32 %result1, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_64_sub_x:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -264,15 +264,15 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 64, %x
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_x_sub_65:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -341,15 +341,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrs
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 %x, 65
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_65_sub_x:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -418,15 +418,15 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrs
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 65, %x
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_x_sub_neg16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -495,15 +495,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 ad
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 %x, -16
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_neg16_sub_x:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -572,15 +572,15 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 ad
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 -16, %x
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_x_sub_neg17:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -649,15 +649,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 ad
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 %x, -17
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i32_neg17_sub_x:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -726,11 +726,11 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 ad
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i32, i32 addrspace(1)* %gep
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i32, ptr addrspace(1) %gep
   %result = sub i32 -17, %x
-  store i32 %result, i32 addrspace(1)* %gep.out
+  store i32 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
@@ -789,7 +789,7 @@ define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
   ret void
 }
 
-define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i16_x_sub_64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -858,15 +858,15 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %x = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i16, ptr addrspace(1) %gep
   %result = sub i16 %x, 64
-  store i16 %result, i16 addrspace(1)* %gep.out
+  store i16 %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i16_x_sub_64_zext_to_i32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -944,16 +944,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load i16, ptr addrspace(1) %gep
   %result = sub i16 %x, 64
   %zext = zext i16 %result to i32
-  store i32 %zext, i32 addrspace(1)* %gep.out
+  store i32 %zext, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_i16_x_sub_64_multi_use:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1052,18 +1052,18 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i16, i16 addrspace(1)* %gep
-  %y = load volatile i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i16, ptr addrspace(1) %gep
+  %y = load volatile i16, ptr addrspace(1) %gep
   %result0 = sub i16 %x, 64
   %result1 = sub i16 %y, 64
-  store volatile i16 %result0, i16 addrspace(1)* %gep.out
-  store volatile i16 %result1, i16 addrspace(1)* %gep.out
+  store volatile i16 %result0, ptr addrspace(1) %gep.out
+  store volatile i16 %result1, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_64_64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1138,15 +1138,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 64, i16 64>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_7_64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1222,15 +1222,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 7, i16 64>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_64_123:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1306,16 +1306,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 64, i16 123>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; Can fold 0 and inline immediate in other half.
-define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_7_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1388,16 +1388,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 7, i16 0>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; Can fold 0 and inline immediate in other half.
-define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_0_16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1468,15 +1468,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(<2 x i16> addrspace(1)* %out,
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 0, i16 16>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_0_1_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1548,15 +1548,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(<2 x i16> addrspace(1)* %out
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 0, i16 -15360>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_sub_0_neg1_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1628,16 +1628,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(<2 x i16> addrspace(1)* %
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = sub <2 x i16> %x, <i16 0, i16 17408>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; -32 isn't an inline immediate, but 32 is
-define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg32_neg32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1712,15 +1712,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -32, i16 -32>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_0_neg32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1791,15 +1791,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(<2 x i16> addrspace(1)* %o
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 0, i16 -32>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg32_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1872,16 +1872,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -32, i16 0>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
 ; 16 and -16 are both inline immediates
-define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg16_neg16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -1956,15 +1956,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -16, i16 -16>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_0_neg16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2035,15 +2035,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(<2 x i16> addrspace(1)* %o
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 0, i16 -16>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg16_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2116,15 +2116,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -16, i16 0>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg_fpone:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2200,15 +2200,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -15360, i16 -15360>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg_negfpone:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2284,15 +2284,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 17408, i16 17408>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg_fptwo:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2368,15 +2368,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 16384, i16 16384>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg_negfptwo:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2452,15 +2452,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -16384, i16 -16384>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_undef_neg32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2531,15 +2531,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(<2 x i16> addrspace(1)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 undef, i16 -32>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 
-define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-LABEL: v_test_v2i16_x_add_neg32_undef:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -2609,11 +2609,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(<2 x i16> addrspace(1)
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
-  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
-  %x = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load <2 x i16>, ptr addrspace(1) %gep
   %result = add <2 x i16> %x, <i16 -32, i16 undef>
-  store <2 x i16> %result, <2 x i16> addrspace(1)* %gep.out
+  store <2 x i16> %result, ptr addrspace(1) %gep.out
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index bc978cc3347f..e0ccbcda7fa0 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
-@stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
-@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+@stored_lds_ptr = addrspace(3) global ptr addrspace(3) undef, align 4
+@stored_constant_ptr = addrspace(3) global ptr addrspace(4) undef, align 8
+@stored_global_ptr = addrspace(3) global ptr addrspace(1) undef, align 8
 
 ; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
@@ -12,19 +12,19 @@
 ; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; GFX9: global_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+define amdgpu_kernel void @reorder_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
+  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
 
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
+  store i32 99, ptr addrspace(1) %gptr, align 4
+  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -36,19 +36,19 @@ define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrsp
 ; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
 ; GFX9: global_store_dword
 ; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
+  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
 
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
+  store volatile i32 99, ptr addrspace(1) %gptr, align 4
+  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -62,20 +62,20 @@ define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_loa
 ; GFX9: s_barrier
 ; GFX9-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
 ; GFX9-DAG: global_store_dword
-define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
+  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
 
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
+  store i32 99, ptr addrspace(1) %gptr, align 4
   call void @llvm.amdgcn.s.barrier() #1
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -93,19 +93,19 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load
 
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
-  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8
+define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(ptr addrspace(1) %out, ptr addrspace(1) %gptr) #0 {
+  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3
 
-  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
-  store i32 99, i32 addrspace(1)* %gptr, align 4
-  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
+  store i32 99, ptr addrspace(1) %gptr, align 4
+  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -122,19 +122,19 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32
 ; GCN-DAG: ds_write_b32
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
-  %ptr0 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(3)* @stored_constant_ptr, align 8
+define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr) #0 {
+  %ptr0 = load ptr addrspace(4), ptr addrspace(3) @stored_constant_ptr, align 8
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 3
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 3
 
-  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
+  store i32 99, ptr addrspace(3) %lptr, align 4
+  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -145,17 +145,17 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 a
 ; GCN: ds_write_b32
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
-define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(4)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(4)* %ptr0, i64 2
+define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(ptr addrspace(1) %out, ptr addrspace(3) noalias %lptr, ptr addrspace(4) %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(4) %ptr0, i64 2
 
-  %tmp1 = load i32, i32 addrspace(4)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(4)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(4) %ptr1, align 4
+  store i32 99, ptr addrspace(3) %lptr, align 4
+  %tmp2 = load i32, ptr addrspace(4) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -168,17 +168,17 @@ define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace
 ; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
 ; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
 ; GFX9: ds_write_b32
-define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
+define amdgpu_kernel void @reorder_global_load_local_store_global_load(ptr addrspace(1) %out, ptr addrspace(3) %lptr, ptr addrspace(1) %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i64 3
 
-  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 99, i32 addrspace(3)* %lptr, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
+  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
+  store i32 99, ptr addrspace(3) %lptr, align 4
+  %tmp2 = load i32, ptr addrspace(1) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
 
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -189,21 +189,21 @@ define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrs
 ; CI: buffer_store_dword
 ; GFX9: global_store_dword
 ; GCN: s_endpgm
-define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
-
-  store i32 123, i32 addrspace(3)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
-  store i32 123, i32 addrspace(3)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
-  store i32 789, i32 addrspace(3)* %ptr3, align 4
+define amdgpu_kernel void @reorder_local_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(3) noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 102
+
+  store i32 123, ptr addrspace(3) %ptr1, align 4
+  %tmp1 = load i32, ptr addrspace(3) %ptr2, align 4
+  %tmp2 = load i32, ptr addrspace(3) %ptr3, align 4
+  store i32 123, ptr addrspace(3) %ptr2, align 4
+  %tmp3 = load i32, ptr addrspace(3) %ptr1, align 4
+  store i32 789, ptr addrspace(3) %ptr3, align 4
 
   %add.0 = add nsw i32 %tmp2, %tmp1
   %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  store i32 %add.1, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -223,21 +223,21 @@ define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %ou
 ; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408
 ; GFX9: global_store_dword
 ; GFX9: s_endpgm
-define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
-
-  store i32 123, i32 addrspace(1)* %ptr1, align 4
-  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
-  store i32 123, i32 addrspace(1)* %ptr2, align 4
-  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 789, i32 addrspace(1)* %ptr3, align 4
+define amdgpu_kernel void @reorder_global_offsets(ptr addrspace(1) nocapture %out, ptr addrspace(1) noalias nocapture readnone %gptr, ptr addrspace(1) noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 102
+
+  store i32 123, ptr addrspace(1) %ptr1, align 4
+  %tmp1 = load i32, ptr addrspace(1) %ptr2, align 4
+  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
+  store i32 123, ptr addrspace(1) %ptr2, align 4
+  %tmp3 = load i32, ptr addrspace(1) %ptr1, align 4
+  store i32 789, ptr addrspace(1) %ptr3, align 4
 
   %add.0 = add nsw i32 %tmp2, %tmp1
   %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  store i32 %add.1, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -266,49 +266,49 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
 ; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
 ; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52
 
-define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
+define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(ptr addrspace(1) noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
 
-  %ptr0 = getelementptr inbounds i32, i32 addrspace(1)* %ptr.base, i64 %id.ext
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 5
-  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 7
-  %ptr4 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 9
-  %ptr5 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 11
-  %ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 13
-
-  store i32 789, i32 addrspace(1)* %ptr0, align 4
-  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
-  store i32 123, i32 addrspace(1)* %ptr2, align 4
-  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
+  %ptr0 = getelementptr inbounds i32, ptr addrspace(1) %ptr.base, i64 %id.ext
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 5
+  %ptr3 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 7
+  %ptr4 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 9
+  %ptr5 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 11
+  %ptr6 = getelementptr inbounds i32, ptr addrspace(1) %ptr0, i32 13
+
+  store i32 789, ptr addrspace(1) %ptr0, align 4
+  %tmp1 = load i32, ptr addrspace(1) %ptr1, align 4
+  store i32 123, ptr addrspace(1) %ptr2, align 4
+  %tmp2 = load i32, ptr addrspace(1) %ptr3, align 4
   %add.0 = add nsw i32 %tmp1, %tmp2
-  store i32 %add.0, i32 addrspace(1)* %ptr4, align 4
-  %tmp3 = load i32, i32 addrspace(1)* %ptr5, align 4
+  store i32 %add.0, ptr addrspace(1) %ptr4, align 4
+  %tmp3 = load i32, ptr addrspace(1) %ptr5, align 4
   %add.1 = add nsw i32 %add.0, %tmp3
-  store i32 %add.1, i32 addrspace(1)* %ptr6, align 4
+  store i32 %add.1, ptr addrspace(1) %ptr6, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
 ; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:2
 ; GCN: tbuffer_store_format
-define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
-  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(ptr addrspace(1) %out, i32 %a1, i32 %vaddr) #0 {
+  %ptr0 = load ptr addrspace(3), ptr addrspace(3) @stored_lds_ptr, align 4
 
-  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
-  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
+  %ptr1 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32, ptr addrspace(3) %ptr0, i32 2
 
-  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32, ptr addrspace(3) %ptr1, align 4
 
   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
   %vaddr.add = add i32 %vaddr, 32
   call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, i32 %vaddr.add, i32 0, i32 0, i32 228, i32 3)
 
-  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32, ptr addrspace(3) %ptr2, align 4
 
   %add = add nsw i32 %tmp1, %tmp2
-  store i32 %add, i32 addrspace(1)* %out, align 4
+  store i32 %add, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
index 7990990478af..13fc406c357d 100644
--- a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -12,81 +12,81 @@
 ; CHECK: buffer_store_byte
 ; ModuleID = 'radeon'
 
-define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+define amdgpu_kernel void @test_8_min_char(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture readonly %in0, ptr addrspace(1) nocapture readonly %in1) #0 {
 entry:
-  %0 = load i8, i8 addrspace(1)* %in0, align 1
+  %0 = load i8, ptr addrspace(1) %in0, align 1
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
-  %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1
-  %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1
+  %arrayidx2.i.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 1
+  %2 = load i8, ptr addrspace(1) %arrayidx2.i.i, align 1
   %3 = insertelement <8 x i8> %1, i8 %2, i32 1
-  %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2
-  %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1
+  %arrayidx6.i.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 2
+  %4 = load i8, ptr addrspace(1) %arrayidx6.i.i, align 1
   %5 = insertelement <8 x i8> %3, i8 %4, i32 2
-  %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3
-  %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1
+  %arrayidx10.i.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 3
+  %6 = load i8, ptr addrspace(1) %arrayidx10.i.i, align 1
   %7 = insertelement <8 x i8> %5, i8 %6, i32 3
-  %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4
-  %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1
+  %arrayidx.i.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 4
+  %8 = load i8, ptr addrspace(1) %arrayidx.i.i, align 1
   %9 = insertelement <8 x i8> undef, i8 %8, i32 0
-  %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5
-  %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1
+  %arrayidx2.i9.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 5
+  %10 = load i8, ptr addrspace(1) %arrayidx2.i9.i, align 1
   %11 = insertelement <8 x i8> %9, i8 %10, i32 1
-  %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6
-  %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1
+  %arrayidx6.i11.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 6
+  %12 = load i8, ptr addrspace(1) %arrayidx6.i11.i, align 1
   %13 = insertelement <8 x i8> %11, i8 %12, i32 2
-  %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7
-  %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1
+  %arrayidx10.i13.i = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 7
+  %14 = load i8, ptr addrspace(1) %arrayidx10.i13.i, align 1
   %15 = insertelement <8 x i8> %13, i8 %14, i32 3
   %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  %16 = load i8, i8 addrspace(1)* %in1, align 1
+  %16 = load i8, ptr addrspace(1) %in1, align 1
   %17 = insertelement <8 x i8> undef, i8 %16, i32 0
-  %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1
-  %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1
+  %arrayidx2.i.i4 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 1
+  %18 = load i8, ptr addrspace(1) %arrayidx2.i.i4, align 1
   %19 = insertelement <8 x i8> %17, i8 %18, i32 1
-  %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2
-  %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1
+  %arrayidx6.i.i5 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 2
+  %20 = load i8, ptr addrspace(1) %arrayidx6.i.i5, align 1
   %21 = insertelement <8 x i8> %19, i8 %20, i32 2
-  %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3
-  %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1
+  %arrayidx10.i.i6 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 3
+  %22 = load i8, ptr addrspace(1) %arrayidx10.i.i6, align 1
   %23 = insertelement <8 x i8> %21, i8 %22, i32 3
-  %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4
-  %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1
+  %arrayidx.i.i7 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 4
+  %24 = load i8, ptr addrspace(1) %arrayidx.i.i7, align 1
   %25 = insertelement <8 x i8> undef, i8 %24, i32 0
-  %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5
-  %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1
+  %arrayidx2.i9.i8 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 5
+  %26 = load i8, ptr addrspace(1) %arrayidx2.i9.i8, align 1
   %27 = insertelement <8 x i8> %25, i8 %26, i32 1
-  %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6
-  %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1
+  %arrayidx6.i11.i9 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 6
+  %28 = load i8, ptr addrspace(1) %arrayidx6.i11.i9, align 1
   %29 = insertelement <8 x i8> %27, i8 %28, i32 2
-  %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7
-  %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1
+  %arrayidx10.i13.i10 = getelementptr inbounds i8, ptr addrspace(1) %in1, i64 7
+  %30 = load i8, ptr addrspace(1) %arrayidx10.i13.i10, align 1
   %31 = insertelement <8 x i8> %29, i8 %30, i32 3
   %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
   %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11
   %32 = extractelement <8 x i8> %cond.i, i32 0
-  store i8 %32, i8 addrspace(1)* %out, align 1
+  store i8 %32, ptr addrspace(1) %out, align 1
   %33 = extractelement <8 x i8> %cond.i, i32 1
-  %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1
+  %arrayidx2.i.i.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %33, ptr addrspace(1) %arrayidx2.i.i.i, align 1
   %34 = extractelement <8 x i8> %cond.i, i32 2
-  %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2
-  store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1
+  %arrayidx.i.i.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 2
+  store i8 %34, ptr addrspace(1) %arrayidx.i.i.i, align 1
   %35 = extractelement <8 x i8> %cond.i, i32 3
-  %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3
-  store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1
-  %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4
+  %arrayidx2.i6.i.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 3
+  store i8 %35, ptr addrspace(1) %arrayidx2.i6.i.i, align 1
+  %arrayidx.i.i3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4
   %36 = extractelement <8 x i8> %cond.i, i32 4
-  store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1
+  store i8 %36, ptr addrspace(1) %arrayidx.i.i3, align 1
   %37 = extractelement <8 x i8> %cond.i, i32 5
-  %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5
-  store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1
+  %arrayidx2.i.i6.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 5
+  store i8 %37, ptr addrspace(1) %arrayidx2.i.i6.i, align 1
   %38 = extractelement <8 x i8> %cond.i, i32 6
-  %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6
-  store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1
+  %arrayidx.i.i7.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 6
+  store i8 %38, ptr addrspace(1) %arrayidx.i.i7.i, align 1
   %39 = extractelement <8 x i8> %cond.i, i32 7
-  %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7
-  store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1
+  %arrayidx2.i6.i8.i = getelementptr inbounds i8, ptr addrspace(1) %out, i64 7
+  store i8 %39, ptr addrspace(1) %arrayidx2.i6.i8.i, align 1
   ret void
 }
 
@@ -97,7 +97,7 @@ attributes #0 = { nounwind }
 !0 = !{null}
 !1 = !{null}
 !2 = !{null}
-!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!3 = !{ptr @test_8_min_char}
 !4 = !{null}
 !5 = !{null}
 !6 = !{null}

diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 95307c84bf94..3c75d3487aa0 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -11,14 +11,14 @@
 ; GCN-PRELINK: call fast float @_Z6sincosfPf(
 ; GCN-NATIVE: call fast float @_Z10native_sinf(
 ; GCN-NATIVE: call fast float @_Z10native_cosf(
-define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3sinf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   %call2 = call fast float @_Z3cosf(float %tmp)
-  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  store float %call2, float addrspace(1)* %arrayidx3, align 4
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  store float %call2, ptr addrspace(1) %arrayidx3, align 4
   ret void
 }
 
@@ -32,14 +32,14 @@ declare float @_Z3cosf(float)
 ; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
 ; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
 ; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
-define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos_v2(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
+  %tmp = load <2 x float>, ptr addrspace(1) %a, align 8
   %call = call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
-  store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
+  store <2 x float> %call, ptr addrspace(1) %a, align 8
   %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
-  %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
-  store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
+  %arrayidx3 = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i64 1
+  store <2 x float> %call2, ptr addrspace(1) %arrayidx3, align 8
   ret void
 }
 
@@ -53,19 +53,17 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>)
 ; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
 ; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
 ; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
-define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos_v3(ptr addrspace(1) nocapture %a) {
 entry:
-  %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
-  %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
+  %loadVec4 = load <4 x float>, ptr addrspace(1) %a, align 16
   %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   %call = call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
   %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-  store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
+  store <4 x float> %extractVec6, ptr addrspace(1) %a, align 16
   %call11 = call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
-  %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
+  %arrayidx12 = getelementptr inbounds <3 x float>, ptr addrspace(1) %a, i64 1
   %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-  %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
-  store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
+  store <4 x float> %extractVec13, ptr addrspace(1) %arrayidx12, align 16
   ret void
 }
 
@@ -79,14 +77,14 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>)
 ; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
 ; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
 ; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
-define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos_v4(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
+  %tmp = load <4 x float>, ptr addrspace(1) %a, align 16
   %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
-  store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
+  store <4 x float> %call, ptr addrspace(1) %a, align 16
   %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
-  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
-  store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  %arrayidx3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i64 1
+  store <4 x float> %call2, ptr addrspace(1) %arrayidx3, align 16
   ret void
 }
 
@@ -100,14 +98,14 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>)
 ; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
 ; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
 ; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
-define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos_v8(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
+  %tmp = load <8 x float>, ptr addrspace(1) %a, align 32
   %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
-  store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
+  store <8 x float> %call, ptr addrspace(1) %a, align 32
   %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
-  %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
-  store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
+  %arrayidx3 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i64 1
+  store <8 x float> %call2, ptr addrspace(1) %arrayidx3, align 32
   ret void
 }
 
@@ -121,14 +119,14 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>)
 ; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
 ; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
 ; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
-define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_sincos_v16(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
+  %tmp = load <16 x float>, ptr addrspace(1) %a, align 64
   %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
-  store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
+  store <16 x float> %call, ptr addrspace(1) %a, align 64
   %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
-  %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
-  store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
+  %arrayidx3 = getelementptr inbounds <16 x float>, ptr addrspace(1) %a, i64 1
+  store <16 x float> %call2, ptr addrspace(1) %arrayidx3, align 64
   ret void
 }
 
@@ -137,22 +135,22 @@ declare <16 x float> @_Z3sinDv16_f(<16 x float>)
 declare <16 x float> @_Z3cosDv16_f(<16 x float>)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
-; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
-define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
+; GCN: store float 0x3FD5555560000000, ptr addrspace(1) %a
+define amdgpu_kernel void @test_native_recip(ptr addrspace(1) nocapture %a) {
 entry:
   %call = call fast float @_Z12native_recipf(float 3.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z12native_recipf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
-; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
-define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
+; GCN: store float 0x3FD5555560000000, ptr addrspace(1) %a
+define amdgpu_kernel void @test_half_recip(ptr addrspace(1) nocapture %a) {
 entry:
   %call = call fast float @_Z10half_recipf(float 3.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -160,11 +158,11 @@ declare float @_Z10half_recipf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
 ; GCN: fmul fast float %tmp, 0x3FD5555560000000
-define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -172,129 +170,129 @@ declare float @_Z13native_divideff(float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
 ; GCN: fmul fast float %tmp, 0x3FD5555560000000
-define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z11half_divideff(float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
-; GCN: store float 1.000000e+00, float addrspace(1)* %a
-define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
+; GCN: store float 1.000000e+00, ptr addrspace(1) %a
+define amdgpu_kernel void @test_pow_0f(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z3powff(float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
-; GCN: store float 1.000000e+00, float addrspace(1)* %a
-define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
+; GCN: store float 1.000000e+00, ptr addrspace(1) %a
+define amdgpu_kernel void @test_pow_0i(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
-; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
-; GCN: store float %tmp, float addrspace(1)* %a, align 4
-define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
+; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
+; GCN: store float %tmp, ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_pow_1f(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
-; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
-; GCN: store float %tmp, float addrspace(1)* %a, align 4
-define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
+; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
+; GCN: store float %tmp, ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_pow_1i(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
-; GCN: %tmp = load float, float addrspace(1)* %a, align 4
+; GCN: %tmp = load float, ptr addrspace(1) %a, align 4
 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
-define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_2f(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
-; GCN: %tmp = load float, float addrspace(1)* %a, align 4
+; GCN: %tmp = load float, ptr addrspace(1) %a, align 4
 ; GCN: %__pow2 = fmul fast float %tmp, %tmp
-define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_2i(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
-; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
-define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_m1f(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
-; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
 ; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
-define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_m1i(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01)
 ; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
-define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_half(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float 5.000000e-01)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
 ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01)
 ; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
-define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_mhalf(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float -5.000000e-01)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -304,12 +302,12 @@ entry:
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
-define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pow_c(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z3powff(float %tmp, float 1.100000e+01)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -319,12 +317,12 @@ entry:
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
-define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_powr_c(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -336,12 +334,12 @@ declare float @_Z4powrff(float, float)
 ; GCN: %__powx22 = fmul fast float %__powx2, %tmp
 ; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
 ; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
-define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_pown_c(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z4pownfi(float %tmp, i32 11)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -357,13 +355,12 @@ declare float @_Z4pownfi(float, i32)
 ; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
 ; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
-; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
-define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
+; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3powff(float %tmp, float 1.013000e+03)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -372,18 +369,18 @@ entry:
 ; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
 ; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
+; GCN-PRELINK: store float %__exp2, ptr addrspace(1) %a, align 4
 ; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
 ; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
-; GCN-NATIVE:  store float %__exp2, float addrspace(1)* %a, align 4
-define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
+; GCN-NATIVE:  store float %__exp2, ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_powr(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
-  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
   %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -400,28 +397,27 @@ entry:
 ; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
 ; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
-; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
-define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
+; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
-  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
   %conv = fptosi float %tmp1 to i32
   %call = call fast float @_Z4pownfi(float %tmp, i32 %conv)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
-; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
-; GCN: store float %tmp, float addrspace(1)* %a, align 4
-define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
+; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4
+; GCN: store float %tmp, ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_rootn_1(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
   %call = call fast float @_Z5rootnfi(float %tmp, i32 1)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -430,129 +426,129 @@ declare float @_Z5rootnfi(float, i32)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
 ; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
-define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5rootnfi(float %tmp, i32 2)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3)
 ; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
-define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_rootn_3(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5rootnfi(float %tmp, i32 3)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
 ; GCN: fdiv fast float 1.000000e+00, %tmp
-define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_rootn_m1(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5rootnfi(float %tmp, i32 -1)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
 ; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
-define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5rootnfi(float %tmp, i32 -2)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
-; GCN: store float %y, float addrspace(1)* %a
-define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
+; GCN: store float %y, ptr addrspace(1) %a
+define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z3fmafff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
-; GCN: store float %y, float addrspace(1)* %a
-define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
+; GCN: store float %y, ptr addrspace(1) %a
+define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
-; GCN: store float %y, float addrspace(1)* %a
-define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
+; GCN: store float %y, ptr addrspace(1) %a
+define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z3madfff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
-; GCN: store float %y, float addrspace(1)* %a
-define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
+; GCN: store float %y, ptr addrspace(1) %a
+define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
 ; GCN: %fmaadd = fadd fast float %tmp, %y
-define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
+define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
 ; GCN: %fmaadd = fadd fast float %tmp, %y
-define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
+define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
 ; GCN: %fmamul = fmul fast float %tmp1, %tmp
-define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) {
 entry:
-  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %tmp1 = load float, float addrspace(1)* %a, align 4
+  %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp = load float, ptr addrspace(1) %arrayidx, align 4
+  %tmp1 = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
 ; GCN-NATIVE: call fast float @_Z10native_expf(float %tmp)
-define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_exp(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3expf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -560,11 +556,11 @@ declare float @_Z3expf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
 ; GCN-NATIVE: call fast float @_Z11native_exp2f(float %tmp)
-define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_exp2(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z4exp2f(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -572,11 +568,11 @@ declare float @_Z4exp2f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
 ; GCN-NATIVE: call fast float @_Z12native_exp10f(float %tmp)
-define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_exp10(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5exp10f(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -584,11 +580,11 @@ declare float @_Z5exp10f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
 ; GCN-NATIVE: call fast float @_Z10native_logf(float %tmp)
-define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_log(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3logf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -596,11 +592,11 @@ declare float @_Z3logf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
 ; GCN-NATIVE: call fast float @_Z11native_log2f(float %tmp)
-define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_log2(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z4log2f(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -608,49 +604,49 @@ declare float @_Z4log2f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
 ; GCN-NATIVE: call fast float @_Z12native_log10f(float %tmp)
-define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_log10(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5log10f(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 declare float @_Z5log10f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
-; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+; GCN-NATIVE: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
 ; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
 ; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
-; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
-define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
+; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4
+define amdgpu_kernel void @test_use_native_powr(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
-  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4
   %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
 ; GCN-NATIVE: call fast float @_Z11native_sqrtf(float %tmp)
-define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_sqrt(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z4sqrtf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64
 ; GCN: call fast double @_Z4sqrtd(double %tmp)
-define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(double addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load double, double addrspace(1)* %a, align 8
+  %tmp = load double, ptr addrspace(1) %a, align 8
   %call = call fast double @_Z4sqrtd(double %tmp)
-  store double %call, double addrspace(1)* %a, align 8
+  store double %call, ptr addrspace(1) %a, align 8
   ret void
 }
 
@@ -659,11 +655,11 @@ declare double @_Z4sqrtd(double)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
 ; GCN-NATIVE: call fast float @_Z12native_rsqrtf(float %tmp)
-define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_rsqrt(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z5rsqrtf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -671,11 +667,11 @@ declare float @_Z5rsqrtf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
 ; GCN-NATIVE: call fast float @_Z10native_tanf(float %tmp)
-define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
+define amdgpu_kernel void @test_use_native_tan(ptr addrspace(1) nocapture %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
   %call = call fast float @_Z3tanf(float %tmp)
-  store float %call, float addrspace(1)* %a, align 4
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
@@ -684,105 +680,95 @@ declare float @_Z3tanf(float)
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
 ; GCN-NATIVE: call float @_Z10native_sinf(float %tmp)
 ; GCN-NATIVE: call float @_Z10native_cosf(float %tmp)
-define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
+define amdgpu_kernel void @test_use_native_sincos(ptr addrspace(1) %a) {
 entry:
-  %tmp = load float, float addrspace(1)* %a, align 4
-  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
-  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
-  %call = call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
-  store float %call, float addrspace(1)* %a, align 4
+  %tmp = load float, ptr addrspace(1) %a, align 4
+  %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
+  %tmp1 = addrspacecast ptr addrspace(1) %arrayidx1 to ptr
+  %call = call fast float @_Z6sincosfPf(float %tmp, ptr %tmp1)
+  store float %call, ptr addrspace(1) %a, align 4
   ret void
 }
 
-declare float @_Z6sincosfPf(float, float*)
+declare float @_Z6sincosfPf(float, ptr)
 
 %opencl.pipe_t = type opaque
 %opencl.reserve_id_t = type opaque
 
-; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[$NOUNWIND:[0-9]+]]
-; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[$NOUNWIND]]
-define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr)
+; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]]
+define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr {
 entry:
-  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
-  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
-  %tmp4 = call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
+  %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr
+  %tmp2 = call i32 @__read_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0
+  %tmp3 = call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4)
+  %tmp4 = call i32 @__read_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0
+  call void @__commit_read_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4)
   ret void
 }
 
-declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32)
+declare i32 @__read_pipe_2(ptr addrspace(1), ptr, i32, i32)
 
-declare %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+declare ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1), i32, i32, i32)
 
-declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32)
+declare i32 @__read_pipe_4(ptr addrspace(1), ptr addrspace(5), i32, ptr, i32, i32)
 
-declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32)
+declare void @__commit_read_pipe(ptr addrspace(1), ptr addrspace(5), i32, i32)
 
-; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
-; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t addrspace(5)* %{{.*}}, i32 2, i32* %{{.*}}) #[[$NOUNWIND]]
-define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr)
+; GCN-PRELINK: call i32 @__write_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]]
+define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr {
 entry:
-  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
-  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
-  %tmp4 = call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
+  %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr
+  %tmp2 = call i32 @__write_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0
+  %tmp3 = call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) #0
+  %tmp4 = call i32 @__write_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0
+  call void @__commit_write_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4) #0
   ret void
 }
 
-declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_2(ptr addrspace(1), ptr, i32, i32) local_unnamed_addr
 
-declare %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+declare ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1), i32, i32, i32) local_unnamed_addr
 
-declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i8*, i32, i32) local_unnamed_addr
+declare i32 @__write_pipe_4(ptr addrspace(1), ptr addrspace(5), i32, ptr, i32, i32) local_unnamed_addr
 
-declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t addrspace(5)*, i32, i32) local_unnamed_addr
+declare void @__commit_write_pipe(ptr addrspace(1), ptr addrspace(5), i32, i32) local_unnamed_addr
 
 %struct.S = type { [100 x i32] }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
-; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}} i8* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}} i16* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}} i32* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}} i64* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64>* %{{.*}}) #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64>* %{{.*}} #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64>* %{{.*}} #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64>* %{{.*}} #[[$NOUNWIND]]
-; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8* %{{.*}} i32 400, i32 4) #[[$NOUNWIND]]
-define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
-entry:
-  %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
-  %tmp1 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
-  %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
-  %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
-  %tmp4 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
-  %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
-  %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
-  %tmp7 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
-  %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
-  %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
-  %tmp10 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
-  %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
-  %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
-  %tmp13 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
-  %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
-  %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
-  %tmp16 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
-  %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
-  %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
-  %tmp19 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
-  %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
-  %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
-  %tmp22 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
-  %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
-  %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
-  %tmp25 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
+; GCN-PRELINK: call i32 @__read_pipe_2_1(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(ptr addrspace(1) %{{.*}}, ptr %{{.*}} i32 400, i32 4) #[[$NOUNWIND]]
+define amdgpu_kernel void @test_pipe_size(ptr addrspace(1) %p1, ptr addrspace(1) %ptr1, ptr addrspace(1) %p2, ptr addrspace(1) %ptr2, ptr addrspace(1) %p4, ptr addrspace(1) %ptr4, ptr addrspace(1) %p8, ptr addrspace(1) %ptr8, ptr addrspace(1) %p16, ptr addrspace(1) %ptr16, ptr addrspace(1) %p32, ptr addrspace(1) %ptr32, ptr addrspace(1) %p64, ptr addrspace(1) %ptr64, ptr addrspace(1) %p128, ptr addrspace(1) %ptr128, ptr addrspace(1) %pu, ptr addrspace(1) %ptru) local_unnamed_addr #0 {
+entry:
+  %tmp = addrspacecast ptr addrspace(1) %ptr1 to ptr
+  %tmp1 = call i32 @__read_pipe_2(ptr addrspace(1) %p1, ptr %tmp, i32 1, i32 1) #0
+  %tmp3 = addrspacecast ptr addrspace(1) %ptr2 to ptr
+  %tmp4 = call i32 @__read_pipe_2(ptr addrspace(1) %p2, ptr %tmp3, i32 2, i32 2) #0
+  %tmp6 = addrspacecast ptr addrspace(1) %ptr4 to ptr
+  %tmp7 = call i32 @__read_pipe_2(ptr addrspace(1) %p4, ptr %tmp6, i32 4, i32 4) #0
+  %tmp9 = addrspacecast ptr addrspace(1) %ptr8 to ptr
+  %tmp10 = call i32 @__read_pipe_2(ptr addrspace(1) %p8, ptr %tmp9, i32 8, i32 8) #0
+  %tmp12 = addrspacecast ptr addrspace(1) %ptr16 to ptr
+  %tmp13 = call i32 @__read_pipe_2(ptr addrspace(1) %p16, ptr %tmp12, i32 16, i32 16) #0
+  %tmp15 = addrspacecast ptr addrspace(1) %ptr32 to ptr
+  %tmp16 = call i32 @__read_pipe_2(ptr addrspace(1) %p32, ptr %tmp15, i32 32, i32 32) #0
+  %tmp18 = addrspacecast ptr addrspace(1) %ptr64 to ptr
+  %tmp19 = call i32 @__read_pipe_2(ptr addrspace(1) %p64, ptr %tmp18, i32 64, i32 64) #0
+  %tmp21 = addrspacecast ptr addrspace(1) %ptr128 to ptr
+  %tmp22 = call i32 @__read_pipe_2(ptr addrspace(1) %p128, ptr %tmp21, i32 128, i32 128) #0
+  %tmp24 = addrspacecast ptr addrspace(1) %ptru to ptr
+  %tmp25 = call i32 @__read_pipe_2(ptr addrspace(1) %pu, ptr %tmp24, i32 400, i32 4) #0
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls2.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls2.ll
index 50a34fac05a6..c1a25b678897 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls2.ll
@@ -2,19 +2,19 @@
 ; RUN: opt -S -amdgpu-simplifylib -debug-only=amdgpu-simplifylib -mtriple=amdgcn-unknown-amdhsa -disable-output < %s 2>&1 | FileCheck %s
 ; RUN: opt -S -passes=amdgpu-simplifylib -debug-only=amdgpu-simplifylib -mtriple=amdgcn-unknown-amdhsa -disable-output < %s 2>&1 | FileCheck %s
 
-; CHECK-NOT: AMDIC: try folding   call void @llvm.lifetime.start.p0i8
-; CHECK-NOT: AMDIC: try folding   call void @llvm.lifetime.end.p0i8
+; CHECK-NOT: AMDIC: try folding   call void @llvm.lifetime.start.p0
+; CHECK-NOT: AMDIC: try folding   call void @llvm.lifetime.end.p0
 ; CHECK-NOT: AMDIC: try folding   call void @llvm.dbg.value
 
 define void @foo(i32 %i) {
-  call void @llvm.lifetime.start.p0i8(i64 1, i8* undef)
-  call void @llvm.lifetime.end.p0i8(i64 1, i8* undef)
+  call void @llvm.lifetime.start.p0(i64 1, ptr undef)
+  call void @llvm.lifetime.end.p0(i64 1, ptr undef)
   call void @llvm.dbg.value(metadata i32 undef, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
   ret void
 }
 
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
-declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 declare void @llvm.dbg.value(metadata, metadata, metadata)
 
 !llvm.module.flags = !{!1}

diff  --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
index f8077cd8e3ab..ed074863c39e 100644
--- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll
@@ -19,7 +19,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fmuladd.f32(float, float, float) #0
 
 ; CHECK: s_endpgm
-define amdgpu_kernel void @foo(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1, float addrspace(1)* noalias nocapture %arg2, float %arg3) local_unnamed_addr !reqd_work_group_size !0 {
+define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture %arg2, float %arg3) local_unnamed_addr !reqd_work_group_size !0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp4 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -28,7 +28,6 @@ bb:
   %tmp7 = sub i32 %tmp6, 0
   %tmp8 = add i32 %tmp7, 0
   %tmp9 = add i32 %tmp8, 0
-  %tmp10 = getelementptr inbounds [462 x float], [462 x float] addrspace(3)* @0, i32 0, i32 0
   br label %bb12
 
 bb11:                                             ; preds = %bb30
@@ -58,8 +57,8 @@ bb17:                                             ; preds = %bb13
 bb21:                                             ; preds = %bb21, %bb17
   %tmp22 = phi i32 [ %tmp4, %bb17 ], [ %tmp25, %bb21 ]
   %tmp23 = add i32 %tmp22, %tmp16
-  %tmp24 = getelementptr inbounds float, float addrspace(3)* %tmp10, i32 %tmp23
-  store float undef, float addrspace(3)* %tmp24, align 4
+  %tmp24 = getelementptr inbounds float, ptr addrspace(3) @0, i32 %tmp23
+  store float undef, ptr addrspace(3) %tmp24, align 4
   %tmp25 = add nuw i32 %tmp22, 8
   br i1 undef, label %bb21, label %.loopexit
 
@@ -77,8 +76,8 @@ bb30:                                             ; preds = %bb31
 
 bb31:                                             ; preds = %bb31, %bb26
   %tmp32 = phi i32 [ %tmp9, %bb26 ], [ undef, %bb31 ]
-  %tmp33 = getelementptr inbounds [462 x float], [462 x float] addrspace(3)* @0, i32 0, i32 %tmp32
-  %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+  %tmp33 = getelementptr inbounds [462 x float], ptr addrspace(3) @0, i32 0, i32 %tmp32
+  %tmp34 = load float, ptr addrspace(3) %tmp33, align 4
   %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp34, float undef, float undef)
   %tmp36 = tail call float @llvm.fmuladd.f32(float undef, float undef, float %tmp35)
   br i1 undef, label %bb30, label %bb31

diff  --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index 6697211fd75b..5b4e5e66edde 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -12,14 +12,14 @@
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @vccz_workaround(i32 addrspace(4)* %in, i32 addrspace(1)* %out, float %cond) {
+define amdgpu_kernel void @vccz_workaround(ptr addrspace(4) %in, ptr addrspace(1) %out, float %cond) {
 entry:
   %cnd = fcmp oeq float 0.0, %cond
-  %sgpr = load volatile i32, i32 addrspace(4)* %in
+  %sgpr = load volatile i32, ptr addrspace(4) %in
   br i1 %cnd, label %if, label %endif
 
 if:
-  store i32 %sgpr, i32 addrspace(1)* %out
+  store i32 %sgpr, ptr addrspace(1) %out
   br label %endif
 
 endif:
@@ -34,14 +34,14 @@ endif:
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @vccz_noworkaround(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 entry:
-  %vgpr = load volatile float, float addrspace(1)* %in
+  %vgpr = load volatile float, ptr addrspace(1) %in
   %cnd = fcmp oeq float 0.0, %vgpr
   br i1 %cnd, label %if, label %endif
 
 if:
-  store float %vgpr, float addrspace(1)* %out
+  store float %vgpr, ptr addrspace(1) %out
   br label %endif
 
 endif:

diff  --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
index 10e243c3dca4..6312816a40c2 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -5,19 +5,19 @@
 ; GCN: s_waitcnt lgkmcnt(0)
 ; GCN: global_store_dword v
 
-define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
+define amdgpu_kernel void @zot(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture %arg1) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = icmp eq i32 %tmp, 0
   br i1 %tmp2, label %bb3, label %bb8
 
 bb3:                                              ; preds = %bb
-  %tmp4 = load i32, i32 addrspace(1)* %arg, align 4
-  store i32 0, i32 addrspace(1)* %arg, align 4
+  %tmp4 = load i32, ptr addrspace(1) %arg, align 4
+  store i32 0, ptr addrspace(1) %arg, align 4
   %tmp5 = zext i32 %tmp4 to i64
-  %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+  %tmp6 = load i64, ptr addrspace(1) %arg1, align 8
   %tmp7 = add i64 %tmp6, %tmp5
-  store i64 %tmp7, i64 addrspace(1)* %arg1, align 8
+  store i64 %tmp7, ptr addrspace(1) %arg1, align 8
   br label %bb8
 
 bb8:                                              ; preds = %bb3, %bb

diff  --git a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
index 7b9d1e228d08..628af58353ba 100644
--- a/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
+++ b/llvm/test/CodeGen/AMDGPU/sram-ecc-default.ll
@@ -13,12 +13,12 @@
 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
 ; NO-ECC: global_load_short_d16_hi
 ; ECC: global_load_ushort
-define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) {
+define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) {
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
-  %load = load i16, i16 addrspace(1)* %gep
+  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
+  %load = load i16, ptr addrspace(1) %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
-  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+  store <2 x i16> %build1, ptr addrspace(1) undef
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
index 47839dcb9344..5651d1c922cc 100644
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -16,11 +16,11 @@ target datalayout = "A5"
 ; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
 
 ; FULL-UNROLL: alloca
-; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, i32 addrspace(5)*
+; FULL-UNROLL-COUNT-256: store i32 {{[0-9]+}}, ptr addrspace(5)
 ; FULL-UNROLL-NOT: br
 
-; FUNC: store i32 %{{[^,]+}}, i32 addrspace(1)* %out
-define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out, i32 %n) {
+; FUNC: store i32 %{{[^,]+}}, ptr addrspace(1) %out
+define amdgpu_kernel void @private_memory(ptr addrspace(1) %out, i32 %n) {
 entry:
   %alloca = alloca [16 x i32], addrspace(5)
   br label %loop.header
@@ -32,8 +32,8 @@ loop.header:
 loop.body:
   %salt = xor i32 %counter, %n
   %idx = and i32 %salt, 15
-  %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %idx
-  store i32 %counter, i32 addrspace(5)* %ptr
+  %ptr = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %idx
+  store i32 %counter, ptr addrspace(5) %ptr
   br label %loop.inc
 
 loop.inc:
@@ -42,8 +42,8 @@ loop.inc:
   br i1 %cmp, label  %exit, label %loop.header
 
 exit:
-  %gep = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %n
-  %load = load i32, i32 addrspace(5)* %gep
-  store i32 %load, i32 addrspace(1)* %out
+  %gep = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %n
+  %load = load i32, ptr addrspace(5) %gep
+  store i32 %load, ptr addrspace(1) %out
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll
index 4f4bab84150d..2568db945133 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll
@@ -8,47 +8,47 @@ declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
-define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
   %ext = zext i1 %carry to i64
   %add2 = add i64 %val, %ext
-  store i64 %add2, i64 addrspace(1)* %out, align 8
+  store i64 %add2, ptr addrspace(1) %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ssubo_i32:
-define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind {
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
   %carry = extractvalue { i32, i1 } %ssub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_ssubo_i32:
-define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load i32, ptr addrspace(1) %aptr, align 4
+  %b = load i32, ptr addrspace(1) %bptr, align 4
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
   %carry = extractvalue { i32, i1 } %ssub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ssubo_i64:
 ; GCN: s_sub_u32
 ; GCN: s_subb_u32
-define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -61,14 +61,14 @@ define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 
 ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
-define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
-  %a = load i64, i64 addrspace(1)* %aptr, align 4
-  %b = load i64, i64 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load i64, ptr addrspace(1) %aptr, align 4
+  %b = load i64, ptr addrspace(1) %bptr, align 4
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; SICIVI: v_cmp_lt_i32
 ; SICIVI: v_cmp_lt_i32
 ; SICIVI: v_sub_{{[iu]}}32
-define amdgpu_kernel void @v_ssubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
+  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
   %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
   %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
   %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   %carry.ext = zext <2 x i1> %carry to <2 x i32>
-  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
index fc75faecdc9b..31aee3f6a507 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -7,7 +7,7 @@
 ; During instruction selection, we use immediate const zero for soffset in
 ; MUBUF stack accesses and let eliminateFrameIndex to fix up this field to use
 ; the correct frame register whenever required.
-define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) {
+define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr addrspace(1) %input, ptr addrspace(1) %output, i32 %i) {
 ; MUBUF-LABEL: kernel_background_evaluate:
 ; MUBUF:       ; %bb.0: ; %entry
 ; MUBUF-NEXT:    s_load_dword s0, s[0:1], 0x24
@@ -136,26 +136,26 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
 entry:
   %sd = alloca < 1339 x i32>, align 8192, addrspace(5)
   %state = alloca <4 x i32>, align 16, addrspace(5)
-  %rslt = call i32 @svm_eval_nodes(float addrspace(5)* %kg, <1339 x i32> addrspace(5)* %sd, <4 x i32> addrspace(5)* %state, i32 0, i32 4194304)
+  %rslt = call i32 @svm_eval_nodes(ptr addrspace(5) %kg, ptr addrspace(5) %sd, ptr addrspace(5) %state, i32 0, i32 4194304)
   %cmp = icmp eq i32 %rslt, 0
   br i1 %cmp, label %shader_eval_surface.exit, label %if.then4.i
 
 if.then4.i:                                       ; preds = %entry
-  %rng_hash.i.i = getelementptr inbounds < 4 x i32>, <4 x i32> addrspace(5)* %state, i32 0, i32 1
-  %tmp0 = load i32, i32 addrspace(5)* %rng_hash.i.i, align 4
-  %rng_offset.i.i = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* %state, i32 0, i32 2
-  %tmp1 = load i32, i32 addrspace(5)* %rng_offset.i.i, align 4
+  %rng_hash.i.i = getelementptr inbounds < 4 x i32>, ptr addrspace(5) %state, i32 0, i32 1
+  %tmp0 = load i32, ptr addrspace(5) %rng_hash.i.i, align 4
+  %rng_offset.i.i = getelementptr inbounds <4 x i32>, ptr addrspace(5) %state, i32 0, i32 2
+  %tmp1 = load i32, ptr addrspace(5) %rng_offset.i.i, align 4
   %add.i.i = add i32 %tmp1, %tmp0
   %add1.i.i = add i32 %add.i.i, 0
   %mul.i.i.i.i = mul i32 %add1.i.i, 1103515245
   %add.i.i.i.i = add i32 %mul.i.i.i.i, 12345
-  store i32 %add.i.i.i.i, i32 addrspace(5)* undef, align 16
+  store i32 %add.i.i.i.i, ptr addrspace(5) undef, align 16
   br label %shader_eval_surface.exit
 
 shader_eval_surface.exit:                         ; preds = %entry
   ret void
 }
 
-declare hidden i32 @svm_eval_nodes(float addrspace(5)*, <1339 x i32> addrspace(5)*, <4 x i32> addrspace(5)*, i32, i32) local_unnamed_addr #0
+declare hidden i32 @svm_eval_nodes(ptr addrspace(5), ptr addrspace(5), ptr addrspace(5), i32, i32) local_unnamed_addr #0
 
 attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

diff  --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
index cee3dc60f0c8..e56226f4e48c 100644
--- a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll
@@ -21,7 +21,7 @@
 define amdgpu_cs float @sub_zext_zext() {
 .entry:
 
-  %t519 = load float, float addrspace(3)* null
+  %t519 = load float, ptr addrspace(3) null
 
   %t524 = fcmp ogt float %t519, 0.000000e+00
   %t525 = fcmp olt float %t519, 0.000000e+00

diff  --git a/llvm/test/CodeGen/AMDGPU/swdev282079.ll b/llvm/test/CodeGen/AMDGPU/swdev282079.ll
index b8a0c8616193..5dcdb9cade83 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev282079.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev282079.ll
@@ -1,13 +1,13 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s
 
-define protected amdgpu_kernel void @foo(i64 addrspace(1)* %arg, i64 addrspace(1)* %arg1) {
+define protected amdgpu_kernel void @foo(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) {
 bb:
-  %tmp = addrspacecast i64* addrspace(5)* null to i64**
-  %tmp2 = call i64 @eggs(i64* undef) #1
-  %tmp3 = load i64*, i64** %tmp, align 8
-  %tmp4 = getelementptr inbounds i64, i64* %tmp3, i64 undef
-  store i64 %tmp2, i64* %tmp4, align 8
+  %tmp = addrspacecast ptr addrspace(5) null to ptr
+  %tmp2 = call i64 @eggs(ptr undef) #1
+  %tmp3 = load ptr, ptr %tmp, align 8
+  %tmp4 = getelementptr inbounds i64, ptr %tmp3, i64 undef
+  store i64 %tmp2, ptr %tmp4, align 8
   ret void
 }
 
-declare hidden i64 @eggs(i64*)
+declare hidden i64 @eggs(ptr)

diff  --git a/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll b/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll
index 6131ab2ae432..8292fdc78613 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-cgp.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s
 
-define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 {
-  store volatile i32 %a, i32* %p, align 4
+define internal fastcc void @callee(ptr nocapture %p, i32 %a) #0 {
+  store volatile i32 %a, ptr %p, align 4
   ret void
 }
 
@@ -9,13 +9,13 @@ define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 {
 ; CHECK: tail call fastcc void @callee(
 ; CHECK-NEXT: ret void
 ; CHECK: ret void
-define void @func_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+define void @func_caller(ptr nocapture %p, i32 %a, i32 %b) #0 {
 entry:
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %bb, label %ret
 
 bb:
-  tail call fastcc void @callee(i32* %p, i32 %a)
+  tail call fastcc void @callee(ptr %p, i32 %a)
   br label %ret
 
 ret:
@@ -27,13 +27,13 @@ ret:
 ; CHECK-NEXT: br label %ret
 
 ; CHECK: ret void
-define amdgpu_kernel void @kernel_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @kernel_caller(ptr nocapture %p, i32 %a, i32 %b) #0 {
 entry:
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %bb, label %ret
 
 bb:
-  tail call fastcc void @callee(i32* %p, i32 %a)
+  tail call fastcc void @callee(ptr %p, i32 %a)
   br label %ret
 
 ret:

diff  --git a/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll b/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
index 75f85f073bf2..17ec27bc49db 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
@@ -15,16 +15,16 @@ declare void @llvm.amdgcn.ds.gws.sema.release.all(i32 %offset) #2
 ; GCN-LABEL: {{^}}taildup_barrier:
 ; GCN: s_barrier
 ; GCN-NOT: s_barrier
-define void @taildup_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #0 {
+define void @taildup_barrier(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond) #0 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -38,16 +38,16 @@ ret:
 ; GCN-LABEL: {{^}}taildup_convergent_call:
 ; GCN: s_swappc_b64
 ; GCN-NOT: s_swappc_b64
-define void @taildup_convergent_call(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
+define void @taildup_convergent_call(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond) #1 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -63,16 +63,16 @@ ret:
 ; GCN-LABEL: {{^}}taildup_nonconvergent_call:
 ; GCN: s_swappc_b64
 ; GCN-NOT: s_swappc_b64
-define void @taildup_nonconvergent_call(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
+define void @taildup_nonconvergent_call(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond) #1 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -86,16 +86,16 @@ ret:
 ; GCN-LABEL: {{^}}taildup_convergent_tailcall:
 ; GCN: s_setpc_b64
 ; GCN-NOT: s_setpc_b64
-define void @taildup_convergent_tailcall(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond) #1 {
+define void @taildup_convergent_tailcall(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond) #1 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -106,16 +106,16 @@ call:
 ; GCN-LABEL: {{^}}taildup_gws_init:
 ; GCN: ds_gws_init
 ; GCN-NOT: ds_gws_init
-define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+define amdgpu_kernel void @taildup_gws_init(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond, i32 %val, i32 %offset) #0 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -129,16 +129,16 @@ ret:
 ; GCN-LABEL: {{^}}taildup_gws_barrier:
 ; GCN: ds_gws_barrier
 ; GCN-NOT: ds_gws_barrier
-define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+define amdgpu_kernel void @taildup_gws_barrier(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond, i32 %val, i32 %offset) #0 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:
@@ -152,16 +152,16 @@ ret:
 ; GCN-LABEL: {{^}}taildup_gws_sema_release_all:
 ; GCN: ds_gws_sema_release_all
 ; GCN-NOT: ds_gws
-define amdgpu_kernel void @taildup_gws_sema_release_all(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %offset) #0 {
+define amdgpu_kernel void @taildup_gws_sema_release_all(ptr addrspace(1) %a, ptr addrspace(1) %b, i1 %cond, i32 %offset) #0 {
 entry:
   br i1 %cond, label %bb1, label %bb2
 
 bb1:
-  store i32 0, i32 addrspace(1)* %a
+  store i32 0, ptr addrspace(1) %a
   br label %call
 
 bb2:
-  store i32 1, i32 addrspace(1)* %a
+  store i32 1, ptr addrspace(1) %a
   br label %call
 
 call:

diff  --git a/llvm/test/CodeGen/AMDGPU/target-mem-intrinsic-metadata.ll b/llvm/test/CodeGen/AMDGPU/target-mem-intrinsic-metadata.ll
index 9c6c8beed669..41072a5e8c6e 100644
--- a/llvm/test/CodeGen/AMDGPU/target-mem-intrinsic-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/target-mem-intrinsic-metadata.ll
@@ -6,13 +6,13 @@
 ; MIR-LABEL: name: ds_append_noalias
 ; MIR: DS_APPEND {{.*}} :: (load store (s32) on %{{.*}}, !noalias !{{[0-9]+}}, addrspace 3)
 define amdgpu_kernel void @ds_append_noalias() {
-  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(1)* null
-  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false), !noalias !0
-  store i32 %val, i32 addrspace(1)* null, align 4
+  %lds = load ptr addrspace(3), ptr addrspace(1) null
+  %val = call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %lds, i1 false), !noalias !0
+  store i32 %val, ptr addrspace(1) null, align 4
   ret void
 }
 
-declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #0
+declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #0
 
 attributes #0 = { argmemonly convergent nounwind willreturn }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index ee4978b97d5a..caf67da9cd2d 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -25,7 +25,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
   ; GCN: bb.2.else:
   ; GCN:   successors:
   ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN:   S_WAITCNT 3952
   ; GCN: bb.3:
 entry:
@@ -34,7 +34,7 @@ entry:
 if:                                               ; preds = %entry
   ret float %b
 else:                                             ; preds = %entry
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   unreachable
 }
 
@@ -62,7 +62,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
   ; GCN: bb.4.else:
   ; GCN:   successors:
   ; GCN:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+  ; GCN:   GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN:   S_WAITCNT 3952
   ; GCN: bb.5:
 entry:
@@ -76,7 +76,7 @@ else.if.cond:                                     ; preds = %entry
 else.if:                                          ; preds = %else.if.cond
   ret float %d
 else:                                             ; preds = %else.if.cond
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   unreachable
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index a64507ca5861..8fddc89d2cec 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -15,7 +15,7 @@
 declare void @llvm.trap() #0
 declare void @llvm.debugtrap() #1
 
-define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
+define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: trap:
 ; NOHSA-TRAP-GFX900-V2:       ; %bb.0:
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
@@ -347,14 +347,14 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
 ; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
 ; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
-  store volatile i32 1, i32 addrspace(1)* %arg0
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }
 
-define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr {
+define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
 ; NOHSA-TRAP-GFX900-V2-LABEL: non_entry_trap:
 ; NOHSA-TRAP-GFX900-V2:       ; %bb.0: ; %entry
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
@@ -777,7 +777,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %
 ; HSA-NOTRAP-GFX900-V4-NEXT:  .LBB1_2: ; %trap
 ; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
 entry:
-  %tmp29 = load volatile i32, i32 addrspace(1)* %arg0
+  %tmp29 = load volatile i32, ptr addrspace(1) %arg0
   %cmp = icmp eq i32 %tmp29, -1
   br i1 %cmp, label %ret, label %trap
 
@@ -786,11 +786,11 @@ trap:
   unreachable
 
 ret:
-  store volatile i32 3, i32 addrspace(1)* %arg0
+  store volatile i32 3, ptr addrspace(1) %arg0
   ret void
 }
 
-define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) {
+define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
 ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap:
 ; NOHSA-TRAP-GFX900-V2:       ; %bb.0:
 ; NOHSA-TRAP-GFX900-V2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1159,9 +1159,9 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0)
 ; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v2, s[0:1]
 ; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
-  store volatile i32 1, i32 addrspace(1)* %arg0
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.debugtrap()
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 77b8b47d1f0b..9489b6beec64 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -23,7 +23,7 @@
 ; RUN: llc -global-isel=0 -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
 ; RUN: llc -global-isel=1 -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
 
-; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (i32 addrspace(1)*): debugtrap handler not supported
+; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
 
 
 declare void @llvm.trap() #0
@@ -50,11 +50,11 @@ declare void @llvm.debugtrap() #1
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define amdgpu_kernel void @hsa_trap(i32 addrspace(1)* nocapture readonly %arg0) {
-  store volatile i32 1, i32 addrspace(1)* %arg0
+define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) {
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }
 
@@ -78,10 +78,10 @@ define amdgpu_kernel void @hsa_trap(i32 addrspace(1)* nocapture readonly %arg0)
 ; TRAP-BIT: enable_trap_handler = 1
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-MESA-TRAP: s_endpgm
-define amdgpu_kernel void @hsa_debugtrap(i32 addrspace(1)* nocapture readonly %arg0) {
-  store volatile i32 1, i32 addrspace(1)* %arg0
+define amdgpu_kernel void @hsa_debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.debugtrap()
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }
 
@@ -91,11 +91,11 @@ define amdgpu_kernel void @hsa_debugtrap(i32 addrspace(1)* nocapture readonly %a
 ; NO-TRAP-BIT: enable_trap_handler = 0
 ; NO-HSA-TRAP: s_endpgm
 ; NO-MESA-TRAP: s_endpgm
-define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
-  store volatile i32 1, i32 addrspace(1)* %arg0
+define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
+  store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
-  store volatile i32 2, i32 addrspace(1)* %arg0
+  store volatile i32 2, ptr addrspace(1) %arg0
   ret void
 }
 
@@ -106,9 +106,9 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) {
 ; HSA-TRAP: BB{{[0-9]_[0-9]+}}: ; %trap
 ; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
 ; HSA-TRAP-NEXT: s_trap 2
-define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr {
+define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
 entry:
-  %tmp29 = load volatile i32, i32 addrspace(1)* %arg0
+  %tmp29 = load volatile i32, ptr addrspace(1) %arg0
   %cmp = icmp eq i32 %tmp29, -1
   br i1 %cmp, label %ret, label %trap
 
@@ -117,7 +117,7 @@ trap:
   unreachable
 
 ret:
-  store volatile i32 3, i32 addrspace(1)* %arg0
+  store volatile i32 3, ptr addrspace(1) %arg0
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
index 6f9f9fc6837f..f8d27fc22d56 100644
--- a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
+++ b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
@@ -16,10 +16,10 @@
 ; loop to not be unrolled at all, but that may change in the future.
 
 ; CHECK-LABEL: @test
-; CHECK: store i8 0, i8 addrspace(1)*
-; CHECK-NOT: store i8 0, i8 addrspace(1)*
+; CHECK: store i8 0, ptr addrspace(1)
+; CHECK-NOT: store i8 0, ptr addrspace(1)
 ; CHECK: ret void
-define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @test(ptr addrspace(1) nocapture %dst, i32 %a, i32 %b, i32 %c) {
 entry:
   %add = add nsw i32 %b, 4
   %cmp = icmp sgt i32 %add, %a
@@ -39,8 +39,8 @@ if.then4:                                         ; preds = %if.then4.lr.ph, %if
   %add2 = add nsw i32 %b.addr.014, 1
   %1 = sext i32 %b.addr.014 to i64
   %add.ptr.sum = add nsw i64 %1, %0
-  %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum
-  store i8 0, i8 addrspace(1)* %add.ptr5, align 1
+  %add.ptr5 = getelementptr inbounds i8, ptr addrspace(1) %dst, i64 %add.ptr.sum
+  store i8 0, ptr addrspace(1) %add.ptr5, align 1
   %inc = add nsw i32 %i.015, 1
   %cmp1 = icmp slt i32 %inc, 4
   %cmp3 = icmp slt i32 %add2, %a

diff  --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index e832dc7614d3..4978273ceb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after twoaddressinstruction < %s | FileCheck %s
 
 ; Check that %16 gets constrained to register class sgpr_96_with_sub0_sub1.
-define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)* inreg %ptr) {
+define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
   ; CHECK-LABEL: name: s_load_constant_v3i32_align4
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $sgpr0, $sgpr1
@@ -20,6 +20,6 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)*
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   $sgpr2 = COPY killed [[COPY4]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG implicit killed $sgpr0, implicit killed $sgpr1, implicit killed $sgpr2
-  %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 4
+  %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 1c76a1f0340b..f93e7579028b 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -9,13 +9,13 @@
 
 ; EG: ADDC_UINT
 ; EG: ADDC_UINT
-define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
   %ext = zext i1 %carry to i64
   %add2 = add i64 %val, %ext
-  store i64 %add2, i64 addrspace(1)* %out, align 8
+  store i64 %add2, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -30,12 +30,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -48,18 +48,18 @@ define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -72,19 +72,19 @@ define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store volatile i32 %val, i32 addrspace(1)* %out, align 4
+  store volatile i32 %val, ptr addrspace(1) %out, align 4
   call void asm sideeffect "", "~{vcc}"() #0
-  store volatile i1 %carry, i1 addrspace(1)* %carryout
+  store volatile i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -94,12 +94,12 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspac
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -115,18 +115,18 @@ define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr
-  %a = load i64, i64 addrspace(1)* %a.gep
-  %b = load i64, i64 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
+  %a = load i64, ptr addrspace(1) %a.gep
+  %b = load i64, ptr addrspace(1) %b.gep
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
-  store i64 %val, i64 addrspace(1)* %out
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -136,18 +136,18 @@ define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 
 ; GFX9: v_add_u16_e32
 ; GFX9: v_cmp_lt_u16_e32
-define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
   %val = extractvalue { i16, i1 } %uadd, 0
   %carry = extractvalue { i16, i1 } %uadd, 1
-  store i16 %val, i16 addrspace(1)* %out
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i16 %val, ptr addrspace(1) %out
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -158,22 +158,22 @@ define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)*
 ; SICIVI: v_cmp_lt_i32
 ; SICIVI: v_cmp_lt_i32
 ; SICIVI: v_add_{{[iu]}}32
-define amdgpu_kernel void @v_uaddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
+  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
   %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
   %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
   %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   %carry.ext = zext <2 x i1> %carry to <2 x i32>
-  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_uaddo_clamp_bit:
 ; GCN: v_add_{{i|u|co_u}}32_e32
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
 entry:
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -187,22 +187,22 @@ if:
 
 exit:
   %cout = phi i1 [false, %entry], [%c2, %if]
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %cout, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %cout, ptr addrspace(1) %carryout
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uaddo_clamp_bit:
 ; GCN: v_add_{{i|u|co_u}}32_e64
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_uaddo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep
-  %b = load i32, i32 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep
+  %b = load i32, ptr addrspace(1) %b.gep
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -215,8 +215,8 @@ if:
 
 exit:
   %cout = phi i1 [false, %entry], [%c2, %if]
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %cout, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %cout, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -225,11 +225,11 @@ exit:
 ; GCN: v_addc
 ; GCN: v_addc
 ; GCN: v_addc
-define amdgpu_cs void @sv_uaddo_i128(i32 addrspace(1)* %out, i128 inreg %a, i128 %b) {
+define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
   %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
   %carry = extractvalue { i128, i1 } %uadd, 1
   %carry.ext = zext i1 %carry to i32
-  store i32 %carry.ext, i32 addrspace(1)* %out
+  store i32 %carry.ext, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index bee04e3c76f9..f82b468542e7 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -12,10 +12,10 @@
 define void @func1() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@func1
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    store i32 0, i32* @x, align 4
+; CHECK-NEXT:    store i32 0, ptr @x, align 4
 ; CHECK-NEXT:    ret void
 ;
-  store i32 0, i32* @x
+  store i32 0, ptr @x
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
index 1a0369be98c1..8a9408711890 100644
--- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -16,6 +16,6 @@ target datalayout = "A5"
 ; R600: MOV
 define amdgpu_kernel void @foo() {
   %alloca = alloca i32, align 4, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/unroll.ll b/llvm/test/CodeGen/AMDGPU/unroll.ll
index 582ffa5ccab3..41bb4bb095c2 100644
--- a/llvm/test/CodeGen/AMDGPU/unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/unroll.ll
@@ -9,8 +9,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:3
 
 ; CHECK-LABEL: @private_memory
 ; CHECK-NOT: alloca
-; CHECK: store i32 5, i32 addrspace(1)* %out
-define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
+; CHECK: store i32 5, ptr addrspace(1) %out
+define amdgpu_kernel void @private_memory(ptr addrspace(1) %out) {
 entry:
   %0 = alloca [32 x i32], addrspace(5)
   br label %loop.header
@@ -20,8 +20,8 @@ loop.header:
   br label %loop.body
 
 loop.body:
-  %ptr = getelementptr [32 x i32], [32 x i32] addrspace(5)* %0, i32 0, i32 %counter
-  store i32 %counter, i32 addrspace(5)* %ptr
+  %ptr = getelementptr [32 x i32], ptr addrspace(5) %0, i32 0, i32 %counter
+  store i32 %counter, ptr addrspace(5) %ptr
   br label %loop.inc
 
 loop.inc:
@@ -30,19 +30,19 @@ loop.inc:
   br i1 %1, label  %exit, label %loop.header
 
 exit:
-  %2 = getelementptr [32 x i32], [32 x i32] addrspace(5)* %0, i32 0, i32 5
-  %3 = load i32, i32 addrspace(5)* %2
-  store i32 %3, i32 addrspace(1)* %out
+  %2 = getelementptr [32 x i32], ptr addrspace(5) %0, i32 0, i32 5
+  %3 = load i32, ptr addrspace(5) %2
+  store i32 %3, ptr addrspace(1) %out
   ret void
 }
 
 ; Check that loop is unrolled for local memory references
 
 ; CHECK-LABEL: @local_memory
-; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK: getelementptr i32, ptr addrspace(1) %out, i32 128
 ; CHECK-NEXT: store
 ; CHECK-NEXT: ret
-define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+define amdgpu_kernel void @local_memory(ptr addrspace(1) %out, ptr addrspace(3) %lds) {
 entry:
   br label %loop.header
 
@@ -51,10 +51,10 @@ loop.header:
   br label %loop.body
 
 loop.body:
-  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
-  %val = load i32, i32 addrspace(3)* %ptr_lds
-  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
-  store i32 %val, i32 addrspace(1)* %ptr_out
+  %ptr_lds = getelementptr i32, ptr addrspace(3) %lds, i32 %counter
+  %val = load i32, ptr addrspace(3) %ptr_lds
+  %ptr_out = getelementptr i32, ptr addrspace(1) %out, i32 %counter
+  store i32 %val, ptr addrspace(1) %ptr_out
   br label %loop.inc
 
 loop.inc:
@@ -75,7 +75,7 @@ exit:
 ; CHECK-NEXT: getelementptr
 ; CHECK-NEXT: store
 ; CHECK-NOT: br
-define amdgpu_kernel void @unroll_for_if(i32 addrspace(5)* %a) {
+define amdgpu_kernel void @unroll_for_if(ptr addrspace(5) %a) {
 entry:
   br label %for.body
 
@@ -86,8 +86,8 @@ for.body:                                         ; preds = %entry, %for.inc
 
 if.then:                                          ; preds = %for.body
   %0 = sext i32 %i1 to i64
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %a, i64 %0
-  store i32 0, i32 addrspace(5)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %a, i64 %0
+  store i32 0, ptr addrspace(5) %arrayidx, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body, %if.then
@@ -103,13 +103,13 @@ for.end:                                          ; preds = %for.cond
 
 ; CHECK-LABEL: @local_memory_runtime
 ; CHECK: loop.header:
-; CHECK: load i32, i32 addrspace(3)*
-; CHECK: load i32, i32 addrspace(3)*
+; CHECK: load i32, ptr addrspace(3)
+; CHECK: load i32, ptr addrspace(3)
 ; CHECK: br i1
 ; CHECK: loop.header.epil
-; CHECK: load i32, i32 addrspace(3)*
+; CHECK: load i32, ptr addrspace(3)
 ; CHECK: ret
-define amdgpu_kernel void @local_memory_runtime(i32 addrspace(1)* %out, i32 addrspace(3)* %lds, i32 %n) {
+define amdgpu_kernel void @local_memory_runtime(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %n) {
 entry:
   br label %loop.header
 
@@ -118,10 +118,10 @@ loop.header:
   br label %loop.body
 
 loop.body:
-  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
-  %val = load i32, i32 addrspace(3)* %ptr_lds
-  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
-  store i32 %val, i32 addrspace(1)* %ptr_out
+  %ptr_lds = getelementptr i32, ptr addrspace(3) %lds, i32 %counter
+  %val = load i32, ptr addrspace(3) %ptr_lds
+  %ptr_out = getelementptr i32, ptr addrspace(1) %out, i32 %counter
+  store i32 %val, ptr addrspace(1) %ptr_out
   br label %loop.inc
 
 loop.inc:

diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index eeb54f927aaf..3a752f4a8d72 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -6,13 +6,13 @@ declare i32 @external_function(i32) nounwind
 
 ; GCN-NOT: error
 ; R600: in function test_call_external{{.*}}: unsupported call to function external_function
-define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_call_external(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %c = call i32 @external_function(i32 %b) nounwind
   %result = add i32 %a, %c
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -23,22 +23,22 @@ define i32 @defined_function(i32 %x) nounwind noinline {
 
 ; GCN-NOT: error
 ; R600: in function test_call{{.*}}: unsupported call to function defined_function
-define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define amdgpu_kernel void @test_call(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %c = call i32 @defined_function(i32 %b) nounwind
   %result = add i32 %a, %c
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
-; GCN: error: <unknown>:0:0: in function test_tail_call i32 (i32 addrspace(1)*, i32 addrspace(1)*): unsupported required tail call to function defined_function
+; GCN: error: <unknown>:0:0: in function test_tail_call i32 (ptr addrspace(1), ptr addrspace(1)): unsupported required tail call to function defined_function
 ; R600: in function test_tail_call{{.*}}: unsupported call to function defined_function
-define i32 @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1)* %in
-  %b = load i32, i32 addrspace(1)* %b_ptr
+define i32 @test_tail_call(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
+  %a = load i32, ptr addrspace(1) %in
+  %b = load i32, ptr addrspace(1) %b_ptr
   %c = tail call i32 @defined_function(i32 %b)
   ret i32 %c
 }
@@ -58,7 +58,7 @@ declare i32 @extern_variadic(...)
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1
-  %call = tail call i32 bitcast (i32 (...)* @extern_variadic to i32 (<4 x float>)*)(<4 x float> %add)
+  %call = tail call i32 @extern_variadic(<4 x float> %add)
   ret i32 %call
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
index 68e91e8c9c6b..1ea0bd0090e0 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -6,11 +6,11 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @slt(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp slt i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -18,11 +18,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ult_i32(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp ult i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -31,11 +31,11 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -43,11 +43,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float_native(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -55,11 +55,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @olt(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -67,11 +67,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sle(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp sle i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -79,11 +79,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ule_i32(ptr addrspace(1) %out, i32 %in) {
 entry:
   %0 = icmp ule i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
-  store i32 %1, i32 addrspace(1)* %out
+  store i32 %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -92,11 +92,11 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -104,11 +104,11 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float_native(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }
 
@@ -116,10 +116,10 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
-define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ole(ptr addrspace(1) %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
-  store float %1, float addrspace(1)* %out
+  store float %1, ptr addrspace(1) %out
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
index 6c5669ed2a1f..2d86a3540d7c 100644
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -40,20 +40,15 @@
 @k0.lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
 define amdgpu_kernel void @k0() {
-  %k0.lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @k0.lds.size.1.align.1 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k0.lds.size.1.align.1.bc, align 1
+   store i8 1, ptr addrspace(3) @k0.lds.size.1.align.1, align 1
 
-  %k0.lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @k0.lds.size.2.align.2 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k0.lds.size.2.align.2.bc, align 2
+   store i8 2, ptr addrspace(3) @k0.lds.size.2.align.2, align 2
 
-  %k0.lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @k0.lds.size.4.align.4 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k0.lds.size.4.align.4.bc, align 4
+   store i8 3, ptr addrspace(3) @k0.lds.size.4.align.4, align 4
 
-  %k0.lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @k0.lds.size.8.align.8 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k0.lds.size.8.align.8.bc, align 8
+   store i8 4, ptr addrspace(3) @k0.lds.size.8.align.8, align 8
 
-  %k0.lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @k0.lds.size.16.align.16 to i8 addrspace(3)*
-   store i8 5, i8 addrspace(3)* %k0.lds.size.16.align.16.bc, align 16
+   store i8 5, ptr addrspace(3) @k0.lds.size.16.align.16, align 16
 
   ret void
 }
@@ -71,20 +66,15 @@ define amdgpu_kernel void @k0() {
 @k1.lds.size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16
 
 define amdgpu_kernel void @k1() {
-  %k1.lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.1 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k1.lds.size.1.align.1.bc, align 1
+   store i8 1, ptr addrspace(3) @k1.lds.size.1.align.1, align 1
 
-  %k1.lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.2 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k1.lds.size.1.align.2.bc, align 2
+   store i8 2, ptr addrspace(3) @k1.lds.size.1.align.2, align 2
 
-  %k1.lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.4 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k1.lds.size.1.align.4.bc, align 4
+   store i8 3, ptr addrspace(3) @k1.lds.size.1.align.4, align 4
 
-  %k1.lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.8 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k1.lds.size.1.align.8.bc, align 8
+   store i8 4, ptr addrspace(3) @k1.lds.size.1.align.8, align 8
 
-  %k1.lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.16 to i8 addrspace(3)*
-   store i8 5, i8 addrspace(3)* %k1.lds.size.1.align.16.bc, align 16
+   store i8 5, ptr addrspace(3) @k1.lds.size.1.align.16, align 16
 
   ret void
 }
@@ -100,17 +90,13 @@ define amdgpu_kernel void @k1() {
 @k2.lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8
 
 define amdgpu_kernel void @k2() {
-  %k2.lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @k2.lds.size.2.align.1 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k2.lds.size.2.align.1.bc, align 1
+   store i8 1, ptr addrspace(3) @k2.lds.size.2.align.1, align 1
 
-  %k2.lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @k2.lds.size.3.align.2 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k2.lds.size.3.align.2.bc, align 2
+   store i8 2, ptr addrspace(3) @k2.lds.size.3.align.2, align 2
 
-  %k2.lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @k2.lds.size.5.align.4 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k2.lds.size.5.align.4.bc, align 4
+   store i8 3, ptr addrspace(3) @k2.lds.size.5.align.4, align 4
 
-  %k2.lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @k2.lds.size.9.align.8 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k2.lds.size.9.align.8.bc, align 8
+   store i8 4, ptr addrspace(3) @k2.lds.size.9.align.8, align 8
 
   ret void
 }
@@ -126,17 +112,13 @@ define amdgpu_kernel void @k2() {
 @k3.lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4
 
 define amdgpu_kernel void @k3() {
-  %k3.lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @k3.lds.size.5.align.2 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k3.lds.size.5.align.2.bc, align 2
+   store i8 1, ptr addrspace(3) @k3.lds.size.5.align.2, align 2
 
-  %k3.lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @k3.lds.size.6.align.2 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k3.lds.size.6.align.2.bc, align 2
+   store i8 2, ptr addrspace(3) @k3.lds.size.6.align.2, align 2
 
-  %k3.lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @k3.lds.size.7.align.2 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k3.lds.size.7.align.2.bc, align 2
+   store i8 3, ptr addrspace(3) @k3.lds.size.7.align.2, align 2
 
-  %k3.lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @k3.lds.size.7.align.4 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k3.lds.size.7.align.4.bc, align 4
+   store i8 4, ptr addrspace(3) @k3.lds.size.7.align.4, align 4
 
   ret void
 }
@@ -152,17 +134,13 @@ define amdgpu_kernel void @k3() {
 @k4.lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8
 
 define amdgpu_kernel void @k4() {
-  %k4.lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @k4.lds.size.9.align.1 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k4.lds.size.9.align.1.bc, align 1
+   store i8 1, ptr addrspace(3) @k4.lds.size.9.align.1, align 1
 
-  %k4.lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @k4.lds.size.10.align.2 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k4.lds.size.10.align.2.bc, align 2
+   store i8 2, ptr addrspace(3) @k4.lds.size.10.align.2, align 2
 
-  %k4.lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @k4.lds.size.11.align.4 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k4.lds.size.11.align.4.bc, align 4
+   store i8 3, ptr addrspace(3) @k4.lds.size.11.align.4, align 4
 
-  %k4.lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @k4.lds.size.12.align.8 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k4.lds.size.12.align.8.bc, align 8
+   store i8 4, ptr addrspace(3) @k4.lds.size.12.align.8, align 8
 
   ret void
 }
@@ -177,17 +155,13 @@ define amdgpu_kernel void @k4() {
 @k5.lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16
 
 define amdgpu_kernel void @k5() {
-  %k5.lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @k5.lds.size.17.align.16 to i8 addrspace(3)*
-   store i8 1, i8 addrspace(3)* %k5.lds.size.17.align.16.bc, align 16
+   store i8 1, ptr addrspace(3) @k5.lds.size.17.align.16, align 16
 
-  %k5.lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @k5.lds.size.18.align.16 to i8 addrspace(3)*
-   store i8 2, i8 addrspace(3)* %k5.lds.size.18.align.16.bc, align 16
+   store i8 2, ptr addrspace(3) @k5.lds.size.18.align.16, align 16
 
-  %k5.lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @k5.lds.size.19.align.16 to i8 addrspace(3)*
-   store i8 3, i8 addrspace(3)* %k5.lds.size.19.align.16.bc, align 16
+   store i8 3, ptr addrspace(3) @k5.lds.size.19.align.16, align 16
 
-  %k5.lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @k5.lds.size.20.align.16 to i8 addrspace(3)*
-   store i8 4, i8 addrspace(3)* %k5.lds.size.20.align.16.bc, align 16
+   store i8 4, ptr addrspace(3) @k5.lds.size.20.align.16, align 16
 
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index bc631d8ca762..44b344ede058 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -10,13 +10,13 @@
 
 ; EG: SUBB_UINT
 ; EG: ADDC_UINT
-define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
   %ext = zext i1 %carry to i64
   %add2 = add i64 %val, %ext
-  store i64 %add2, i64 addrspace(1)* %out, align 8
+  store i64 %add2, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -31,12 +31,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -49,18 +49,18 @@ define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -73,19 +73,19 @@ define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %uadd = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
-  store volatile i32 %val, i32 addrspace(1)* %out, align 4
+  store volatile i32 %val, ptr addrspace(1) %out, align 4
   call void asm sideeffect "", "~{vcc}"() #0
-  store volatile i1 %carry, i1 addrspace(1)* %carryout
+  store volatile i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -97,12 +97,12 @@ define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspac
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -119,18 +119,18 @@ define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i64, i64 addrspace(1)* %b.ptr
-  %a = load i64, i64 addrspace(1)* %a.gep
-  %b = load i64, i64 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
+  %a = load i64, ptr addrspace(1) %a.gep
+  %b = load i64, ptr addrspace(1) %b.gep
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
-  store i64 %val, i64 addrspace(1)* %out, align 8
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i64 %val, ptr addrspace(1) %out, align 8
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -144,18 +144,18 @@ define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 
 ; GFX9: v_sub_u16_e32
 ; GFX9: v_cmp_gt_u16_e32
-define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %b.ptr
-  %a = load i16, i16 addrspace(1)* %a.gep
-  %b = load i16, i16 addrspace(1)* %b.gep
+  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
+  %a = load i16, ptr addrspace(1) %a.gep
+  %b = load i16, ptr addrspace(1) %b.gep
   %usub = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
   %val = extractvalue { i16, i1 } %usub, 0
   %carry = extractvalue { i16, i1 } %usub, 1
-  store i16 %val, i16 addrspace(1)* %out
-  store i1 %carry, i1 addrspace(1)* %carryout
+  store i16 %val, ptr addrspace(1) %out
+  store i1 %carry, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -164,22 +164,22 @@ define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)*
 ; SICIVI: v_cndmask_b32
 ; SICIVI: v_sub_{{[iu]}}32
 ; SICIVI: v_cndmask_b32
-define amdgpu_kernel void @v_usubo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %carryout, <2 x i32> addrspace(1)* %aptr, <2 x i32> addrspace(1)* %bptr) nounwind {
-  %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
-  %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
+  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
   %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
   %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
   %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
-  store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
+  store <2 x i32> %val, ptr addrspace(1) %out, align 4
   %carry.ext = zext <2 x i1> %carry to <2 x i32>
-  store <2 x i32> %carry.ext, <2 x i32> addrspace(1)* %carryout
+  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_usubo_clamp_bit:
 ; GCN: v_sub_{{i|u|co_u}}32_e32
 ; GCN: s_endpgm
-define amdgpu_kernel void @s_usubo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
 entry:
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
@@ -193,8 +193,8 @@ if:
 
 exit:
   %cout = phi i1 [false, %entry], [%c2, %if]
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %cout, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %cout, ptr addrspace(1) %carryout
   ret void
 }
 
@@ -202,14 +202,14 @@ exit:
 ; FUNC-LABEL: {{^}}v_usubo_clamp_bit:
 ; GCN: v_sub_{{i|u|co_u}}32_e64
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_usubo_clamp_bit(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
-  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
-  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
-  %a = load i32, i32 addrspace(1)* %a.gep, align 4
-  %b = load i32, i32 addrspace(1)* %b.gep, align 4
+  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
+  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
+  %a = load i32, ptr addrspace(1) %a.gep, align 4
+  %b = load i32, ptr addrspace(1) %b.gep, align 4
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -222,8 +222,8 @@ if:
 
 exit:
   %cout = phi i1 [false, %entry], [%c2, %if]
-  store i32 %val, i32 addrspace(1)* %out, align 4
-  store i1 %cout, i1 addrspace(1)* %carryout
+  store i32 %val, ptr addrspace(1) %out, align 4
+  store i1 %cout, ptr addrspace(1) %carryout
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll
index 1326ba437f94..47e2904f19af 100644
--- a/llvm/test/CodeGen/AMDGPU/v1024.ll
+++ b/llvm/test/CodeGen/AMDGPU/v1024.ll
@@ -9,23 +9,22 @@
 define amdgpu_kernel void @test_v1024() {
 entry:
   %alloca = alloca <32 x i32>, align 16, addrspace(5)
-  %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 128, i1 false)
   br i1 undef, label %if.then.i.i, label %if.else.i
 
 if.then.i.i:                                      ; preds = %entry
-  call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 16 %cast, i8 addrspace(5)* align 4 undef, i64 128, i1 false)
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %alloca, ptr addrspace(5) align 4 undef, i64 128, i1 false)
   br label %if.then.i62.i
 
 if.else.i:                                        ; preds = %entry
   br label %if.then.i62.i
 
 if.then.i62.i:                                    ; preds = %if.else.i, %if.then.i.i
-  call void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* align 4 undef, i8 addrspace(5)* align 16 %cast, i64 128, i1 false)
+  call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 undef, ptr addrspace(5) align 16 %alloca, i64 128, i1 false)
   ret void
 }
 
-declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture readonly, i8, i32, i1 immarg)
-declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture readonly, i8, i32, i1 immarg)
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg)
 
-declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
+declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg)

diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index 3867e6d65728..78c13fe79aad 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -15,13 +15,13 @@ declare double @llvm.fabs.f64(double)
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
-  %f = load float, float addrspace(1)* %f.gep
+  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
+  %f = load float, ptr addrspace(1) %f.gep
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
-  store float %select, float addrspace(1)* %out
+  store float %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -41,10 +41,10 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, fl
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
-  store float %select, float addrspace(1)* %out
+  store float %select, ptr addrspace(1) %out
   ret void
 }
 
@@ -60,13 +60,13 @@ define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f)
 ; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
 ; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
 ; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 1.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -77,13 +77,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)*
 ; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
 ; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
 ; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 1.0, float %x
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -94,13 +94,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)*
 ; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
 ; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
 ; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 0.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -111,13 +111,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)*
 ; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
 ; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
 ; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 0.0, float %x
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -126,15 +126,15 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)*
 ; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
 ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %z = load float, float addrspace(1)* %z.gep
+  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %z = load float, ptr addrspace(1) %z.gep
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 0.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -143,15 +143,15 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)*
 ; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]]
 ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
-define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %z = load float, float addrspace(1)* %z.gep
+  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %z = load float, ptr addrspace(1) %z.gep
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, float 1.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -162,15 +162,15 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)*
 ; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; SIVI:     v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
 ; GFX10:    v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %x = load float, float addrspace(1)* %x.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load float, ptr addrspace(1) %x.gep
   %setcc = fcmp olt float %x, 0.0
   %select = select i1 %setcc, float 1.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -179,17 +179,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile float, float addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile float, ptr addrspace(1) %z.gep
   %setcc = fcmp ult float %x, 0.0
   %select = select i1 %setcc, float 1.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -198,17 +198,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)*
 ; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
-define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i32, i32 addrspace(1)* %x.gep
-  %z = load volatile i32, i32 addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i32, ptr addrspace(1) %x.gep
+  %z = load volatile i32, ptr addrspace(1) %z.gep
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i32 2, i32 %z
-  store i32 %select, i32 addrspace(1)* %out.gep
+  store i32 %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -218,17 +218,17 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %
 ; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v[[[X_LO]]:[[X_HI]]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
-define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i64, i64 addrspace(1)* %x.gep
-  %z = load volatile i64, i64 addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i64, ptr addrspace(1) %x.gep
+  %z = load volatile i64, ptr addrspace(1) %z.gep
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i64 2, i64 %z
-  store i64 %select, i64 addrspace(1)* %out.gep
+  store i64 %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -241,17 +241,17 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
   %setcc = fcmp ugt float %x, 4.0
   %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
-  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
+  store <4 x float> %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -264,17 +264,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrs
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
   %setcc = fcmp ugt float %x, 4.0
   %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
-  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
+  store <4 x float> %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -290,17 +290,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrs
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
   %setcc = fcmp ugt float 4.0, %x
   %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
-  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
+  store <4 x float> %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -313,17 +313,17 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrs
 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
 ; GCN: store_{{byte|b8}}
-define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i32, i32 addrspace(1)* %x.gep
-  %z = load volatile i1, i1 addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i32, ptr addrspace(1) %x.gep
+  %z = load volatile i1, ptr addrspace(1) %z.gep
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
-  store i1 %select, i1 addrspace(1)* %out.gep
+  store i1 %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -337,17 +337,17 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %ou
 ; SIVI-DAG:  v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
 ; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile double, double addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile double, ptr addrspace(1) %z.gep
   %setcc = fcmp ult float %x, 0.0
   %select = select i1 %setcc, double 1.0, double %z
-  store double %select, double addrspace(1)* %out.gep
+  store double %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -359,17 +359,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace
 ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile i64, i64 addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile i64, ptr addrspace(1) %z.gep
   %setcc = fcmp one float %x, 0.0
   %select = select i1 %setcc, i64 3, i64 %z
-  store i64 %select, i64 addrspace(1)* %out.gep
+  store i64 %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -380,17 +380,17 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)
 
 ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
-define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile i32, i32 addrspace(1)* %x.gep
-  %z = load volatile float, float addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile i32, ptr addrspace(1) %x.gep
+  %z = load volatile float, ptr addrspace(1) %z.gep
   %setcc = icmp ugt i32 %x, 1
   %select = select i1 %setcc, float 4.0, float %z
-  store float %select, float addrspace(1)* %out.gep
+  store float %select, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -402,19 +402,19 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(
 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
-define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
-  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
-  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
-  %x = load volatile float, float addrspace(1)* %x.gep
-  %z = load volatile float, float addrspace(1)* %z.gep
+  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
+  %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
+  %x = load volatile float, ptr addrspace(1) %x.gep
+  %z = load volatile float, ptr addrspace(1) %z.gep
   %setcc = fcmp ugt float 4.0, %x
   %select0 = select i1 %setcc, float -1.0, float %z
   %select1 = select i1 %setcc, float -2.0, float %z
-  store volatile float %select0, float addrspace(1)* %out.gep
-  store volatile float %select1, float addrspace(1)* %out.gep
+  store volatile float %select0, ptr addrspace(1) %out.gep
+  store volatile float %select1, ptr addrspace(1) %out.gep
   ret void
 }
 
@@ -422,44 +422,44 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float a
 
 ; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
-define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
-  %f = load half, half addrspace(1)* %f.gep
+  %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
+  %f = load half, ptr addrspace(1) %f.gep
   %f.abs = call half @llvm.fabs.f16(half %f)
   %f.neg = fneg half %f
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, half %f.abs, half %f.neg
-  store half %select, half addrspace(1)* %out
+  store half %select, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
-define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
-  %f = load float, float addrspace(1)* %f.gep
+  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
+  %f = load float, ptr addrspace(1) %f.gep
   %f.abs = call float @llvm.fabs.f32(float %f)
   %f.neg = fneg float %f
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float %f.abs, float %f.neg
-  store float %select, float addrspace(1)* %out
+  store float %select, ptr addrspace(1) %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
-define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
-  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
-  %f = load double, double addrspace(1)* %f.gep
+  %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
+  %f = load double, ptr addrspace(1) %f.gep
   %f.abs = call double @llvm.fabs.f64(double %f)
   %f.neg = fneg double %f
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, double %f.abs, double %f.neg
-  store double %select, double addrspace(1)* %out
+  store double %select, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
index 2cda52a8438a..a48969d7cb0f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
@@ -5,33 +5,33 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(ptr addrspace(1) %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(ptr addrspace(1) %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(ptr addrspace(1) %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(ptr addrspace(1) %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -40,20 +40,20 @@ define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(ptr addrspace(1) %out, float %src, i32 %reg) {
   %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0
   %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0
   %result3 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %result2) #0
-  store i32 %result3, i32 addrspace(1)* %out, align 4
+  store i32 %result3, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(ptr addrspace(1) %out, float %src, i32 %idx, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0
-  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %result, ptr addrspace(1) %out, align 4
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
index 12a358c299c0..0ce2b1876b5e 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI: [[FLOW_BB]]:
 ; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]]
 ;
-define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define amdgpu_kernel void @test_if(i32 %b, ptr addrspace(1) %src, ptr addrspace(1) %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   switch i32 %tid, label %default [
@@ -33,26 +33,26 @@ entry:
   ]
 
 case1:
-  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx1 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
+  store i32 13, ptr addrspace(1) %arrayidx1, align 4
   br label %end
 
 case2:
-  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
-  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
+  %arrayidx5 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
+  store i32 17, ptr addrspace(1) %arrayidx5, align 4
   br label %end
 
 default:
   %cmp8 = icmp eq i32 %tid, 2
-  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
+  %arrayidx10 = getelementptr i32, ptr addrspace(1) %dst, i32 %b
   br i1 %cmp8, label %if, label %else
 
 if:
-  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 19, ptr addrspace(1) %arrayidx10, align 4
   br label %end
 
 else:
-  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
+  store i32 21, ptr addrspace(1) %arrayidx10, align 4
   br label %end
 
 end:
@@ -69,14 +69,14 @@ end:
 
 ; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: s_endpgm
-define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
   br i1 %is.0, label %then, label %exit
 
 then:
-  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
-  store i32 999, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
+  store i32 999, ptr addrspace(1) %gep
   br label %exit
 
 exit:
@@ -95,14 +95,14 @@ exit:
 
 ; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: s_endpgm
-define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if_ret_else_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %is.0 = icmp ne i32 %tid, 0
   br i1 %is.0, label %then, label %exit
 
 then:
-  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
-  store i32 999, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
+  store i32 999, ptr addrspace(1) %gep
   ret void
 
 exit:
@@ -132,18 +132,18 @@ exit:
 
 ; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: ds_write_b32
-define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %is.0 = icmp ne i32 %tid, 0
   br i1 %is.0, label %then, label %exit
 
 then:
-  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
-  store i32 999, i32 addrspace(1)* %gep
+  %gep = getelementptr i32, ptr addrspace(1) %dst, i32 %tid
+  store i32 999, ptr addrspace(1) %gep
   ret void
 
 exit:
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -161,7 +161,7 @@ exit:
 ; SI: s_cbranch_scc1 [[LABEL_LOOP]]
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
-define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
@@ -170,10 +170,10 @@ entry:
 
 loop:
   %i = phi i32 [%tid, %entry], [%i.inc, %loop]
-  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
-  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
-  %load = load i32, i32 addrspace(1)* %src
-  store i32 %load, i32 addrspace(1)* %gep.dst
+  %gep.src = getelementptr i32, ptr addrspace(1) %src, i32 %i
+  %gep.dst = getelementptr i32, ptr addrspace(1) %dst, i32 %i
+  %load = load i32, ptr addrspace(1) %src
+  store i32 %load, ptr addrspace(1) %gep.dst
   %i.inc = add nsw i32 %i, 1
   %cmp = icmp eq i32 %limit, %i.inc
   br i1 %cmp, label %exit, label %loop
@@ -220,12 +220,12 @@ exit:
 ; SI: [[LABEL_EXIT]]:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
-define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp4 = sext i32 %tmp to i64
-  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
-  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg3, i64 %tmp4
+  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
   %tmp7 = icmp sgt i32 %tmp6, 0
   %tmp8 = sext i32 %tmp6 to i64
   br i1 %tmp7, label %bb10, label %bb26
@@ -233,10 +233,10 @@ bb:
 bb10:                                             ; preds = %bb, %bb20
   %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
   %tmp12 = add nsw i64 %tmp11, %tmp4
-  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
-  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
-  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
-  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+  %tmp13 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp12
+  %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
+  %tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg2, i64 %tmp12
+  %tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
   %tmp17 = icmp ne i32 %tmp14, -1
   %tmp18 = icmp ne i32 %tmp16, -1
   %tmp19 = and i1 %tmp17, %tmp18
@@ -244,8 +244,8 @@ bb10:                                             ; preds = %bb, %bb20
 
 bb20:                                             ; preds = %bb10
   %tmp21 = add nsw i32 %tmp16, %tmp14
-  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
-  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
+  %tmp22 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp12
+  store i32 %tmp21, ptr addrspace(1) %tmp22, align 4
   %tmp23 = add nuw nsw i64 %tmp11, 1
   %tmp24 = icmp slt i64 %tmp23, %tmp8
   br i1 %tmp24, label %bb10, label %bb26

diff --git a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
index 1c2776ed4fca..42038cfbfc2b 100644
--- a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -8,22 +8,22 @@
 
 ; ModuleID = 'vop-shrink.ll'
 
-define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+define amdgpu_kernel void @sub_rev(ptr addrspace(1) %out, <4 x i32> %sgpr, i32 %cond) {
 entry:
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp = icmp eq i32 %cond, 0
   br i1 %tmp, label %if, label %else
 
 if:                                               ; preds = %entry
-  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
+  %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 1
   %tmp2 = extractelement <4 x i32> %sgpr, i32 1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  store i32 %tmp2, ptr addrspace(1) %out
   br label %endif
 
 else:                                             ; preds = %entry
   %tmp3 = extractelement <4 x i32> %sgpr, i32 2
   %tmp4 = sub i32 %vgpr, %tmp3
-  store i32 %tmp4, i32 addrspace(1)* %out
+  store i32 %tmp4, ptr addrspace(1) %out
   br label %endif
 
 endif:                                            ; preds = %else, %if
@@ -35,12 +35,12 @@ endif:                                            ; preds = %else, %if
 
 ; FUNC-LABEL: {{^}}add_fold:
 ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
-define amdgpu_kernel void @add_fold(float addrspace(1)* %out) {
+define amdgpu_kernel void @add_fold(ptr addrspace(1) %out) {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = uitofp i32 %tmp to float
   %tmp2 = fadd float %tmp1, 1.024000e+03
-  store float %tmp2, float addrspace(1)* %out
+  store float %tmp2, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 9ad95bb0ab6f..98f13357111b 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -9,13 +9,13 @@
 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo
 ; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}}
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}}
-define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp sgt i32 %load, 0
   %sel = select i1 %cmp, i32 1, i32 2
-  store i32 %sel, i32 addrspace(1)* %gep, align 4
+  store i32 %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -24,13 +24,13 @@ define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) {
 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo
 ; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}}
-define amdgpu_kernel void @test_vopc_f32(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
+  %load = load float, ptr addrspace(1) %gep, align 4
   %cmp = fcmp ugt float %load, 0.0
   %sel = select i1 %cmp, float 1.0, float 2.0
-  store float %sel, float addrspace(1)* %gep, align 4
+  store float %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -48,14 +48,14 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) {
 ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
 ; GFX1064: v_cmp_le_f16_sdwa [[SC:vcc|s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD
 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]]
-define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %lid
-  %load = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
+  %load = load <2 x half>, ptr addrspace(1) %gep, align 4
   %elt = extractelement <2 x half> %load, i32 1
   %cmp = fcmp ugt half %elt, 0.0
   %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
-  store <2 x half> %sel, <2 x half> addrspace(1)* %gep, align 4
+  store <2 x half> %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -64,11 +64,11 @@ define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) {
 ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]
 ; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}}
-define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x)
   %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
   %ext = zext i1 %cmp to i32
-  store i32 %ext, i32 addrspace(1)* %out, align 4
+  store i32 %ext, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -78,10 +78,10 @@ define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0
 
 ; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}}
 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}}
-define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x) #0 {
+define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 {
   %cmp = fcmp oeq half %x, 0x7FF0000000000000
   %sel = select i1 %cmp, half 1.0, half %x
-  store half %sel, half addrspace(1)* %out, align 2
+  store half %sel, ptr addrspace(1) %out, align 2
   ret void
 }
 
@@ -94,15 +94,15 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x)
 ; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}}
 ; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]]
-define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid
-  %load = load float, float addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
+  %load = load float, ptr addrspace(1) %gep, align 4
   %cmp = fcmp ugt float %load, 0.0
   %cmp2 = fcmp ult float %load, 1.0
   %and = and i1 %cmp, %cmp2
   %sel = select i1 %and, float 1.0, float 2.0
-  store float %sel, float addrspace(1)* %gep, align 4
+  store float %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -115,15 +115,15 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) {
 ; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}}
 ; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
-define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp sgt i32 %load, 0
   %cmp2 = icmp slt i32 %load, 1
   %xor = xor i1 %cmp, %cmp2
   %sel = select i1 %xor, i32 1, i32 2
-  store i32 %sel, i32 addrspace(1)* %gep, align 4
+  store i32 %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -136,15 +136,15 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) {
 ; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}}
 ; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]]
 ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]]
-define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) {
+define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid
-  %load = load i32, i32 addrspace(1)* %gep, align 4
+  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
+  %load = load i32, ptr addrspace(1) %gep, align 4
   %cmp = icmp ugt i32 %load, 3
   %cmp2 = icmp ult i32 %load, 2
   %or = or i1 %cmp, %cmp2
   %sel = select i1 %or, i32 1, i32 2
-  store i32 %sel, i32 addrspace(1)* %gep, align 4
+  store i32 %sel, ptr addrspace(1) %gep, align 4
   ret void
 }
 
@@ -152,13 +152,13 @@ define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) {
 ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo
 ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}}
 ; GCN: s_cbranch_execz
-define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cmp = icmp ugt i32 %lid, 10
   br i1 %cmp, label %if, label %endif
 
 if:
-  store i32 0, i32 addrspace(1)* %arg, align 4
+  store i32 0, ptr addrspace(1) %arg, align 4
   br label %endif
 
 endif:
@@ -189,7 +189,7 @@ endif:
 ; GCN:   ; %bb.{{[0-9]+}}:
 ; GCN:   .LBB{{.*}}:
 ; GCN:     s_endpgm
-define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb2
@@ -204,13 +204,13 @@ bb2:
 
 bb5:
   %tmp6 = sext i32 %tmp3 to i64
-  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
-  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
+  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
   %tmp9 = icmp sgt i32 %tmp8, 10
   br i1 %tmp9, label %bb10, label %bb11
 
 bb10:
-  store i32 %tmp, i32 addrspace(1)* %tmp7, align 4
+  store i32 %tmp, ptr addrspace(1) %tmp7, align 4
   br label %bb13
 
 bb11:
@@ -255,7 +255,7 @@ bb13:
 ; GCN-DAG: global_load_dword [[LOAD:v[0-9]+]]
 ; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
 ; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
-define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = icmp eq i32 %tmp, 0
@@ -267,13 +267,13 @@ bb:
 bb2:
   %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
   %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
-  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
+  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
   %tmp7 = icmp sgt i32 %tmp6, 10
   br i1 %tmp7, label %bb8, label %.loopexit
 
 bb8:
-  store i32 %tmp, i32 addrspace(1)* %tmp5, align 4
+  store i32 %tmp, ptr addrspace(1) %tmp5, align 4
   %tmp9 = add nuw nsw i32 %tmp3, 1
   %tmp10 = icmp ult i32 %tmp9, 256
   %tmp11 = icmp ult i32 %tmp9, %tmp
@@ -289,13 +289,13 @@ bb8:
 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
-define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
+define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
+  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
   %tmp5 = add nsw i64 %tmp4, %arg1
-  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
+  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
   ret void
 }
 
@@ -304,13 +304,13 @@ bb:
 ; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
 ; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
 ; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
-define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
+define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
+  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
   %tmp5 = sub nsw i64 %tmp4, %arg1
-  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
+  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
   ret void
 }
 
@@ -319,13 +319,13 @@ bb:
 ; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
 ; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
 ; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
-define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
+define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp
-  %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8
+  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
+  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
   %tmp5 = sub nsw i64 %arg1, %tmp4
-  store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8
+  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
   ret void
 }
 
@@ -339,48 +339,48 @@ bb:
 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_addc_u32 s{{[0-9]+}}, 0, s{{[0-9]+}}
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
 bb:
-  %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1
-  %tmp1 = load i64, i64 addrspace(1)* %tmp, align 8
-  %tmp2 = load i64, i64 addrspace(1)* %arg, align 8
+  %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1
+  %tmp1 = load i64, ptr addrspace(1) %tmp, align 8
+  %tmp2 = load i64, ptr addrspace(1) %arg, align 8
   %tmp3 = udiv i64 %tmp1, %tmp2
-  %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 2
-  store i64 %tmp3, i64 addrspace(1)* %tmp4, align 8
+  %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2
+  store i64 %tmp3, ptr addrspace(1) %tmp4, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_div_scale_f32:
 ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile float, float addrspace(1)* %gep.0, align 4
-  %b = load volatile float, float addrspace(1)* %gep.1, align 4
+  %a = load volatile float, ptr addrspace(1) %gep.0, align 4
+  %b = load volatile float, ptr addrspace(1) %gep.1, align 4
 
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
-  store float %result0, float addrspace(1)* %out, align 4
+  store float %result0, ptr addrspace(1) %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_div_scale_f64:
 ; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
 ; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
-  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
 
-  %a = load volatile double, double addrspace(1)* %gep.0, align 8
-  %b = load volatile double, double addrspace(1)* %gep.1, align 8
+  %a = load volatile double, ptr addrspace(1) %gep.0, align 8
+  %b = load volatile double, ptr addrspace(1) %gep.1, align 8
 
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
-  store double %result0, double addrspace(1)* %out, align 8
+  store double %result0, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -411,9 +411,9 @@ define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX1032: s_cselect_b32 vcc_lo, -1, 0
 ; GFX1064: s_cselect_b64 vcc, -1, 0
 ; GCN:     v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
-  store float %result, float addrspace(1)* %out, align 4
+  store float %result, ptr addrspace(1) %out, align 4
   ret void
 }
 
@@ -422,9 +422,9 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a,
 ; GFX1032: s_cselect_b32 vcc_lo, -1, 0
 ; GFX1064: s_cselect_b64 vcc, -1, 0
 ; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
-  store double %result, double addrspace(1)* %out, align 8
+  store double %result, ptr addrspace(1) %out, align 8
   ret void
 }
 
@@ -441,30 +441,30 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
 ; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]]
 ; GFX1064: s_or_b64 exec, exec, [[SAVE]]
 ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) #0 {
+define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
-  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
-  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
-  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2
+  %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1
+  %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2
 
-  %a = load float, float addrspace(1)* %gep.a
-  %b = load float, float addrspace(1)* %gep.b
-  %c = load float, float addrspace(1)* %gep.c
+  %a = load float, ptr addrspace(1) %gep.a
+  %b = load float, ptr addrspace(1) %gep.b
+  %c = load float, ptr addrspace(1) %gep.c
 
   %cmp0 = icmp eq i32 %tid, 0
   br i1 %cmp0, label %bb, label %exit
 
 bb:
-  %val = load volatile i32, i32 addrspace(1)* %dummy
+  %val = load volatile i32, ptr addrspace(1) %dummy
   %cmp1 = icmp ne i32 %val, 0
   br label %exit
 
 exit:
   %cond = phi i1 [false, %entry], [%cmp1, %bb]
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone
-  store float %result, float addrspace(1)* %gep.out, align 4
+  store float %result, ptr addrspace(1) %gep.out, align 4
   ret void
 }
 
@@ -477,10 +477,10 @@ exit:
 
 ; GCN-NOT: vcc
 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
-  store float %fdiv, float addrspace(1)* %out
+  store float %fdiv, ptr addrspace(1) %out
   ret void
 }
 
@@ -489,33 +489,33 @@ entry:
 ; GFX1064:  v_cmp_nlt_f16_e32 vcc,
 ; GCN-NEXT: s_cbranch_vccnz
 define amdgpu_kernel void @test_br_cc_f16(
-    half addrspace(1)* %r,
-    half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    ptr addrspace(1) %r,
+    ptr addrspace(1) %a,
+    ptr addrspace(1) %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load half, ptr addrspace(1) %a
+  %b.val = load half, ptr addrspace(1) %b
   %fcmp = fcmp olt half %a.val, %b.val
   br i1 %fcmp, label %one, label %two
 
 one:
-  store half %a.val, half addrspace(1)* %r
+  store half %a.val, ptr addrspace(1) %r
   ret void
 
 two:
-  store half %b.val, half addrspace(1)* %r
+  store half %b.val, ptr addrspace(1) %r
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_brcc_i1:
 ; GCN:      s_bitcmp0_b32 s{{[0-9]+}}, 0
 ; GCN-NEXT: s_cbranch_scc1
-define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
   br i1 %cmp0, label %store, label %end
 
 store:
-  store i32 222, i32 addrspace(1)* %out
+  store i32 222, ptr addrspace(1) %out
   ret void
 
 end:
@@ -549,7 +549,7 @@ bb0:
   br i1 %tmp9, label %bb1, label %bb2
 
 bb1:
-  store volatile i32 0, i32 addrspace(1)* undef
+  store volatile i32 0, ptr addrspace(1) undef
   br label %bb2
 
 bb2:
@@ -574,7 +574,7 @@ bb1:                                              ; preds = %Flow, %bb
   br i1 %cmp0, label %bb4, label %Flow
 
 bb4:                                              ; preds = %bb1
-  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %load = load volatile i32, ptr addrspace(1) undef, align 4
   %cmp1 = icmp sge i32 %tmp, %load
   br label %Flow
 
@@ -584,7 +584,7 @@ Flow:                                             ; preds = %bb4, %bb1
   br i1 %tmp3, label %bb1, label %bb9
 
 bb9:                                              ; preds = %Flow
-  store volatile i32 7, i32 addrspace(3)* undef
+  store volatile i32 7, ptr addrspace(3) undef
   ret void
 }
 
@@ -601,12 +601,12 @@ bb9:                                              ; preds = %Flow
 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc
 ; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}}
 ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
-define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
   %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-  store i32 %value, i32 addrspace(1)* %out
+  store i32 %value, ptr addrspace(1) %out
   ret void
 }
 
@@ -617,9 +617,9 @@ entry:
 ; GFX1064: s_not_b64 exec, exec{{$}}
 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42
 ; GFX1064: s_not_b64 exec, exec{{$}}
-define amdgpu_kernel void @test_set_inactive(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 {
   %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
-  store i32 %tmp, i32 addrspace(1)* %out
+  store i32 %tmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -632,9 +632,9 @@ define amdgpu_kernel void @test_set_inactive(i32 addrspace(1)* %out, i32 %in) #0
 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
 ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0
 ; GFX1064: s_not_b64 exec, exec{{$}}
-define amdgpu_kernel void @test_set_inactive_64(i64 addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
   %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
-  store i64 %tmp, i64 addrspace(1)* %out
+  store i64 %tmp, ptr addrspace(1) %out
   ret void
 }
 
@@ -821,10 +821,10 @@ main_body:
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
 ; GCN:         store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s
-define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -836,9 +836,9 @@ define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
 ; GCN:         store_dwordx2 v{{[0-9]+}}, v[[[V_LO]]:[[V_HI]]], s
-define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
-  store i64 %result, i64 addrspace(1)* %out
+  store i64 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -848,10 +848,10 @@ define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src)
 ; GFX1064:     v_cmp_eq_f32_e64 s[[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
 ; GCN:         store_dword v{{[0-9]+}}, v[[V_LO]], s
-define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -861,9 +861,9 @@ define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src
 ; GFX1064:     v_cmp_eq_u32_e64 s[[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
 ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
 ; GCN:         store_dword v{{[0-9]+}}, v[[V_LO]], s
-define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) {
   %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
-  store i32 %result, i32 addrspace(1)* %out
+  store i32 %result, ptr addrspace(1) %out
   ret void
 }
 
@@ -921,9 +921,9 @@ define amdgpu_ps float @test_ps_live() #0 {
 ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]]
 ; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0
 ; GFX1064: s_and_b64 vcc, exec, [[C]]
-define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %v = load double, double addrspace(1)* %in
+  %v = load double, ptr addrspace(1) %in
   %cc = fcmp oeq double %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
 
@@ -933,7 +933,7 @@ if:
 
 endif:
   %r = phi double [ %v, %entry ], [ %u, %if ]
-  store double %r, double addrspace(1)* %out
+  store double %r, ptr addrspace(1) %out
   ret void
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 7e612f53151c..1abdaf4853a5 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -150,7 +150,7 @@ main_body:
 }
 
 ; Check that WQM is re-enabled when required.
-define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
+define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
 ; GFX9-W64-LABEL: test4:
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
@@ -586,7 +586,7 @@ define amdgpu_ps float @test_wwm6_then() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -594,7 +594,7 @@ main_body:
   br i1 %cc, label %endif, label %if
 
 if:
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
   br label %endif
@@ -667,7 +667,7 @@ define amdgpu_ps float @test_wwm6_loop() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -675,7 +675,7 @@ main_body:
 
 loop:
   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
   %counter.1 = sub i32 %counter, 1
@@ -1058,7 +1058,7 @@ define amdgpu_ps float @test_strict_wqm6_then() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -1066,7 +1066,7 @@ main_body:
   br i1 %cc, label %endif, label %if
 
 if:
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
   br label %endif
@@ -1145,7 +1145,7 @@ define amdgpu_ps float @test_strict_wqm6_loop() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -1153,7 +1153,7 @@ main_body:
 
 loop:
   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
   %counter.1 = sub i32 %counter, 1
@@ -1623,7 +1623,7 @@ END:
 }
 
 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
-define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
 ; GFX9-W64-LABEL: test_kill_0:
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
@@ -2003,13 +2003,12 @@ entry:
 
   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)
 
-  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
-  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4
+  store volatile i32 %a, ptr addrspace(5) %array, align 4
 
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
 
-  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
-  %c = load i32, i32 addrspace(5)* %c.gep, align 4
+  %c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
+  %c = load i32, ptr addrspace(5) %c.gep, align 4
   %c.bc = bitcast i32 %c to float
   %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)
@@ -2103,7 +2102,7 @@ entry:
   br i1 %cc, label %if, label %else
 
 if:
-  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
+  store volatile <4 x float> %dtex, ptr addrspace(1) undef
   unreachable
 
 else:
@@ -2590,7 +2589,7 @@ define amdgpu_ps float @test_strict_wwm6_then() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -2598,7 +2597,7 @@ main_body:
   br i1 %cc, label %endif, label %if
 
 if:
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
   br label %endif
@@ -2667,7 +2666,7 @@ define amdgpu_ps float @test_strict_wwm6_loop() {
 ; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    ; return to shader part epilog
 main_body:
-  %src0 = load volatile float, float addrspace(1)* undef
+  %src0 = load volatile float, ptr addrspace(1) undef
   ; use mbcnt to make sure the branch is divergent
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
@@ -2675,7 +2674,7 @@ main_body:
 
 loop:
   %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
-  %src1 = load volatile float, float addrspace(1)* undef
+  %src1 = load volatile float, ptr addrspace(1) undef
   %out = fadd float %src0, %src1
   %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
   %counter.1 = sub i32 %counter, 1
@@ -3216,7 +3215,7 @@ main_body:
 
 ; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
 ; vector comparisons in Wave32 mode.
-define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
+define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
 ; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
 ; GFX9-W64:       ; %bb.0: ; %main_body
 ; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
@@ -3249,7 +3248,7 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)*
 ; GFX10-W32-NEXT:    exp null off, off, off, off done vm
 ; GFX10-W32-NEXT:    s_endpgm
 main_body:
-  %1 = ptrtoint float addrspace(6)* %0 to i32
+  %1 = ptrtoint ptr addrspace(6) %0 to i32
   %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
   %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
   %4 = fcmp nsz arcp ugt float %3, 0.000000e+00

diff --git a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 36532365d871..d06bc5387776 100644
--- a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -4,7 +4,7 @@
 ;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
-define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @fill3d(ptr addrspace(1) nocapture %out) #0 {
 entry:
   %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
   %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
@@ -30,8 +30,8 @@ entry:
   %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1
   %add.i = add i32 %z.i8.i, %mul33.i
   %add13 = add i32 %add.i, %add
-  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13
-  store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %add13
+  store i32 %mul3, ptr addrspace(1) %arrayidx, align 4
   ret void
 }
 
@@ -78,4 +78,4 @@ attributes #1 = { nounwind readnone }
 
 !0 = !{null}
 !1 = !{null}
-!2 = !{void (i32 addrspace(1)*)* @fill3d}
+!2 = !{ptr @fill3d}

More information about the llvm-commits mailing list